From df9c2284c9bc2b5be943a21d993378d0e3dc226f Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 8 Oct 2021 09:54:47 -0700 Subject: [PATCH 01/57] ENH/WIP/POC: EA-backed Index --- pandas/_libs/index.pyx | 473 ++++++++++++++++++ pandas/conftest.py | 8 + pandas/core/arrays/base.py | 19 + pandas/core/arrays/masked.py | 13 +- pandas/core/indexes/base.py | 35 +- pandas/core/indexes/interval.py | 7 +- pandas/tests/arithmetic/test_numeric.py | 4 +- .../arrays/categorical/test_constructors.py | 3 +- pandas/tests/arrays/integer/test_dtypes.py | 3 +- pandas/tests/base/test_value_counts.py | 11 + pandas/tests/extension/base/ea_index.py | 11 + pandas/tests/frame/test_reductions.py | 2 +- pandas/tests/groupby/test_function.py | 4 +- pandas/tests/groupby/test_quantile.py | 16 +- pandas/tests/indexes/common.py | 3 +- pandas/tests/indexes/test_numpy_compat.py | 22 +- pandas/tests/indexing/test_indexing.py | 3 + pandas/tests/series/methods/test_astype.py | 10 +- pandas/tests/strings/test_extract.py | 1 + 19 files changed, 613 insertions(+), 35 deletions(-) create mode 100644 pandas/tests/extension/base/ea_index.py diff --git a/pandas/_libs/index.pyx b/pandas/_libs/index.pyx index ea0bebea8299b..fc6de84631d72 100644 --- a/pandas/_libs/index.pyx +++ b/pandas/_libs/index.pyx @@ -34,6 +34,7 @@ from pandas._libs import ( ) from pandas._libs.missing cimport ( + C_NA as NA, checknull, is_matching_na, ) @@ -765,3 +766,475 @@ cdef class BaseMultiIndexCodesEngine: # Generated from template. 
include "index_class_helper.pxi" + + +@cython.freelist(32) +cdef class ExtensionEngine: + cdef readonly: + object values # ExtensionArray + bint over_size_threshold + + cdef: + bint unique, monotonic_inc, monotonic_dec + bint need_monotonic_check, need_unique_check + + def __init__(self, values: "ExtensionArray"): + self.values = values + + self.over_size_threshold = len(values) >= _SIZE_CUTOFF + self.need_unique_check = True + self.need_monotonic_check = True + self.need_unique_check = True + + def clear_mapping(self): + # for compat with IndexEngine + pass + + @property + def is_unique(self) -> bool: + if self.need_unique_check: + arr = self.values.unique() + self.unique = len(arr) == len(self.values) + + self.need_unique_check = False + return self.unique + + @property + def is_monotonic_increasing(self) -> bool: + if self.need_monotonic_check: + self._do_monotonic_check() + + return self.monotonic_inc == 1 + + @property + def is_monotonic_decreasing(self) -> bool: + if self.need_monotonic_check: + self._do_monotonic_check() + + return self.monotonic_dec == 1 + + cdef inline _do_monotonic_check(self): + cdef: + bint is_unique + + # FIXME: shouldn't depend on non-required _values_for_argsort + try: + self.monotonic_inc, self.monotonic_dec, is_unique = \ + self._call_monotonic(self.values._values_for_argsort()) + except TypeError: + self.monotonic_inc = 0 + self.monotonic_dec = 0 + is_unique = 0 + + self.need_monotonic_check = 0 + + # we can only be sure of uniqueness if is_unique=1 + if is_unique: + self.unique = 1 + self.need_unique_check = 0 + + cdef _call_monotonic(self, values): + return algos.is_monotonic(values, timelike=False) + + def __contains__(self, val: object) -> bool: + # We assume before we get here: + # - val is hashable + try: + self.get_loc(val) + return True + except KeyError: + return False + + cpdef get_loc(self, object val): + # -> Py_ssize_t | slice | ndarray[bool] + cdef: + Py_ssize_t loc + + if is_definitely_invalid_key(val): + raise 
TypeError(f"'{val}' is an invalid key") + + self._check_type(val) + + if self.over_size_threshold and self.is_monotonic_increasing: + if not self.is_unique: + return self._get_loc_duplicates(val) + + values = self.values + + loc = self._searchsorted_left(val) + if loc >= len(values): + raise KeyError(val) + if values[loc] != val: + raise KeyError(val) + return loc + + if not self.unique: + return self._get_loc_duplicates(val) + + return self._get_loc_duplicates(val) + + cdef Py_ssize_t _searchsorted_left(self, val) except? -1: + """ + See ObjectEngine._searchsorted_left.__doc__. + """ + try: + loc = self.values.searchsorted(val, side="left") + except TypeError as err: + # GH#35788 e.g. val=None with float64 values + raise KeyError(val) + return loc + + cdef inline _get_loc_duplicates(self, object val): + # -> Py_ssize_t | slice | ndarray[bool] + cdef: + Py_ssize_t diff + + if self.is_monotonic_increasing: + values = self.values + try: + left = values.searchsorted(val, side='left') + right = values.searchsorted(val, side='right') + except TypeError: + # e.g. 
GH#29189 get_loc(None) with a Float64Index + raise KeyError(val) + + diff = right - left + if diff == 0: + raise KeyError(val) + elif diff == 1: + return left + else: + return slice(left, right) + + return self._maybe_get_bool_indexer(val) + + cdef _get_bool_indexer(self, val): + if checknull(val): + return self.values.isna() # FIXME: need to check for *matching* NA + + return self.values == val + + cdef _maybe_get_bool_indexer(self, object val): + # Returns ndarray[bool] or int + cdef: + ndarray[uint8_t, ndim=1, cast=True] indexer + + indexer = _get_bool_indexer(self.values, val) + return _unpack_bool_indexer(indexer, val) + + def sizeof(self, deep: bool = False) -> int: + """ return the sizeof our mapping """ + return 0 + + def __sizeof__(self) -> int: + return self.sizeof() + + cdef _check_type(self, object val): + hash(val) + + def get_indexer(self, values: "ExtensionArray") -> np.ndarray: + # Note: we only get here with self.is_unique + cdef: + Py_ssize_t i, N = len(values) + + res = np.empty(N, dtype=np.intp) + + for i in range(N): + val = values[i] + try: + loc = self.get_loc(val) + # Because we are unique, loc should always be an integer + except KeyError: + loc = -1 + res[i] = loc + + return res + + def get_indexer_non_unique(self, targets: "ExtensionArray"): + """ + Return an indexer suitable for taking from a non unique index + return the labels in the same order as the target + and a missing indexer into the targets (which correspond + to the -1 indices in the results + + Returns + ------- + indexer : np.ndarray[np.intp] + missing : np.ndarray[np.intp] + """ + cdef: + Py_ssize_t i, N = len(targets) + + indexer = [] + missing = [] + + # See also IntervalIndex.get_indexer_pointwise + for i in range(N): + val = targets[i] + + try: + locs = self.get_loc(val) + except KeyError: + locs = np.array([-1], dtype=np.intp) + missing.append(i) + else: + if isinstance(locs, slice): + # Only needed for get_indexer_non_unique + locs = np.arange(locs.start, locs.stop, 
locs.step, dtype=np.intp) + elif util.is_integer_object(locs): + locs = np.array([locs], dtype=np.intp) + else: + assert locs.dtype.kind == "b" + locs = locs.nonzero()[0] + + indexer.append(locs) + + indexer = np.concatenate(indexer, dtype=np.intp) + missing = np.array(missing, dtype=np.intp) + + return indexer, missing + + +@cython.freelist(32) +cdef class NullableEngine: + + cdef readonly: + ndarray _values, _mask + bint over_size_threshold + bint has_missing + object values # MaskedArray + + cdef: + bint unique, monotonic_inc, monotonic_dec + bint need_monotonic_check, need_unique_check + + def __init__(self, values: "MaskedArray"): + self.values = values + + self._values = values._data + self._mask = values._mask + + self.has_missing = values._mask.any() + + self.over_size_threshold = len(values) >= _SIZE_CUTOFF + self.need_unique_check = True + + def clear_mapping(self): + # for compat with IndexEngine + pass + + @property + def is_unique(self) -> bool: + if self.need_unique_check: + arr = self.values.unique() + self.unique = len(arr) == len(self.values) + + self.need_unique_check = False + return self.unique + + @property + def is_monotonic_increasing(self) -> bool: + if self.need_monotonic_check: + self._do_monotonic_check() + + return self.monotonic_inc == 1 + + @property + def is_monotonic_decreasing(self) -> bool: + if self.need_monotonic_check: + self._do_monotonic_check() + + return self.monotonic_dec == 1 + + cdef inline _do_monotonic_check(self): + cdef: + bint is_unique + + if self.has_missing: + self.monotonic_inc = 0 + self.monotonic_dec = 0 + self.need_monotonic_check = 0 + return + + # If there are no missing, then we can just look at self._values + try: + self.monotonic_inc, self.monotonic_dec, is_unique = \ + self._call_monotonic(self._values) + except TypeError: + self.monotonic_inc = 0 + self.monotonic_dec = 0 + is_unique = 0 + + self.need_monotonic_check = 0 + + # we can only be sure of uniqueness if is_unique=1 + if is_unique: + self.unique 
= 1 + self.need_unique_check = 0 + + cdef _call_monotonic(self, values): + return algos.is_monotonic(values, timelike=False) + + def __contains__(self, val: object) -> bool: + # We assume before we get here: + # - val is hashable + try: + self.get_loc(val) + return True + except KeyError: + return False + + cpdef get_loc(self, object val): + # -> Py_ssize_t | slice | ndarray[bool] + cdef: + Py_ssize_t loc + + if is_definitely_invalid_key(val): + raise TypeError(f"'{val}' is an invalid key") + + self._check_type(val) + + if self.over_size_threshold and self.is_monotonic_increasing: + if not self.is_unique: + return self._get_loc_duplicates(val) + + values = self.values + + loc = self._searchsorted_left(val) + if loc >= len(values): + raise KeyError(val) + if values[loc] != val: + raise KeyError(val) + return loc + + if not self.unique: + return self._get_loc_duplicates(val) + + return self._get_loc_duplicates(val) + + cdef Py_ssize_t _searchsorted_left(self, val) except? -1: + """ + See ObjectEngine._searchsorted_left.__doc__. + """ + try: + loc = self.values.searchsorted(val, side="left") + except TypeError as err: + # GH#35788 e.g. val=None with float64 values + raise KeyError(val) + return loc + + cdef inline _get_loc_duplicates(self, object val): + # -> Py_ssize_t | slice | ndarray[bool] + cdef: + Py_ssize_t diff + + if self.is_monotonic_increasing: + values = self.values + try: + left = values.searchsorted(val, side='left') + right = values.searchsorted(val, side='right') + except TypeError: + # e.g. 
GH#29189 get_loc(None) with a Float64Index + raise KeyError(val) + + diff = right - left + if diff == 0: + raise KeyError(val) + elif diff == 1: + return left + else: + return slice(left, right) + + return self._maybe_get_bool_indexer(val) + + cdef _get_bool_indexer(self, val): + if val is NA: + return self._mask + + if util.is_nan(val): + res = np.isnan(self._values) + else: + res = self._values == val + + res[self._mask] = False + return res + + cdef _maybe_get_bool_indexer(self, object val): + # Returns ndarray[bool] or int + cdef: + ndarray[uint8_t, ndim=1, cast=True] indexer + + indexer = self._get_bool_indexer(val) + return _unpack_bool_indexer(indexer, val) + + def sizeof(self, deep: bool = False) -> int: + """ return the sizeof our mapping """ + return 0 + + def __sizeof__(self) -> int: + return self.sizeof() + + cdef _check_type(self, object val): + hash(val) + + def get_indexer(self, values: "MaskedArray") -> np.ndarray: + # Note: we only get here with self.is_unique + cdef: + Py_ssize_t i, N = len(values) + + res = np.empty(N, dtype=np.intp) + + for i in range(N): + val = values[i] + try: + loc = self.get_loc(val) + # Because we are unique, loc should always be an integer + except KeyError: + loc = -1 + res[i] = loc + + return res + + def get_indexer_non_unique(self, targets: "MaskedArray"): + """ + Return an indexer suitable for taking from a non unique index + return the labels in the same order as the target + and a missing indexer into the targets (which correspond + to the -1 indices in the results + + Returns + ------- + indexer : np.ndarray[np.intp] + missing : np.ndarray[np.intp] + """ + cdef: + Py_ssize_t i, N = len(targets) + + indexer = [] + missing = [] + + # See also IntervalIndex.get_indexer_pointwise + for i in range(N): + val = targets[i] + + try: + locs = self.get_loc(val) + except KeyError: + locs = np.array([-1], dtype=np.intp) + missing.append(i) + else: + if isinstance(locs, slice): + # Only needed for get_indexer_non_unique + locs = 
np.arange(locs.start, locs.stop, locs.step, dtype=np.intp) + elif util.is_integer_object(locs): + locs = np.array([locs], dtype=np.intp) + else: + assert locs.dtype.kind == "b" + locs = locs.nonzero()[0] + + indexer.append(locs) + + indexer = np.concatenate(indexer, dtype=np.intp) + missing = np.array(missing, dtype=np.intp) + + return indexer, missing diff --git a/pandas/conftest.py b/pandas/conftest.py index 44b805c632723..3428c6a65da79 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -498,6 +498,14 @@ def _create_mi_with_dt64tz_level(): "mi-with-dt64tz-level": _create_mi_with_dt64tz_level(), "multi": _create_multiindex(), "repeats": Index([0, 0, 1, 1, 2, 2]), + "nullable_int": Index(np.arange(100), dtype="Int64"), + "nullable_float": Index(np.arange(100), dtype="Float32"), + "nullable_bool": Index(np.arange(100).astype(bool), dtype="boolean"), + "nullable_int-na": Index(np.arange(100), dtype="Int64").insert(1, pd.NA), + "nullable_float-na": Index(np.arange(100), dtype="Float32").insert(1, pd.NA), + "nullable_bool-na": Index(np.arange(100).astype(bool), dtype="boolean").insert( + 1, pd.NA + ), } diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 46b0a6873986e..91ed9466bfb69 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -1348,6 +1348,25 @@ def _reduce(self, name: str, *, skipna: bool = True, **kwargs): # ------------------------------------------------------------------------ # Non-Optimized Default Methods + def putmask(self, mask: np.ndarray, value) -> None: + """ + Analogue to np.putmask(self, mask, value) + + Parameters + ---------- + mask : np.ndarray[bool] + value : scalar or listlike + + Raises + ------ + TypeError + If value cannot be inserted into self. 
+ """ + if not is_list_like(value): + self[mask] = value + else: + self[mask] = value[mask] + def delete(self: ExtensionArrayT, loc: PositionalIndexer) -> ExtensionArrayT: indexer = np.delete(np.arange(len(self)), loc) return self.take(indexer) diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index 6a03456673604..3737719460431 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -564,7 +564,7 @@ def value_counts(self, dropna: bool = True) -> Series: # TODO(extension) # if we have allow Index to hold an ExtensionArray # this is easier - index = value_counts.index._values.astype(object) + index = value_counts.index # ._values.astype(object) # if we want nans, count the mask if dropna: @@ -574,10 +574,13 @@ def value_counts(self, dropna: bool = True) -> Series: counts[:-1] = value_counts counts[-1] = self._mask.sum() - index = Index( - np.concatenate([index, np.array([self.dtype.na_value], dtype=object)]), - dtype=object, - ) + index = index.insert(-1, self.dtype.na_value) + # index = Index( + # np.concatenate([index, np.array([self.dtype.na_value], dtype=object)]), + # dtype=object, + # ) + + index = index.astype(self.dtype) mask = np.zeros(len(counts), dtype="bool") counts = IntegerArray(counts, mask) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index da953fe46ef1d..d2d9c70404367 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -135,6 +135,7 @@ tz_to_dtype, validate_tz_from_dtype, ) +from pandas.core.arrays.masked import BaseMaskedArray from pandas.core.arrays.sparse import SparseDtype from pandas.core.base import ( IndexOpsMixin, @@ -410,8 +411,9 @@ def __new__( validate_tz_from_dtype(dtype, tz) dtype = tz_to_dtype(tz) - if isinstance(data, PandasArray): - # ensure users don't accidentally put a PandasArray in an index. 
+ if type(data) is PandasArray: + # ensure users don't accidentally put a PandasArray in an index, + # but don't unpack StringArray data = data.to_numpy() if isinstance(dtype, PandasDtype): dtype = dtype.numpy_dtype @@ -433,7 +435,6 @@ def __new__( ea_cls = dtype.construct_array_type() data = ea_cls._from_sequence(data, dtype=dtype, copy=copy) - data = np.asarray(data, dtype=object) disallow_kwargs(kwargs) return Index._simple_new(data, name=name) @@ -445,8 +446,8 @@ def __new__( return result.astype(dtype, copy=False) return result - data = np.array(data, dtype=object, copy=copy) disallow_kwargs(kwargs) + data = extract_array(data, extract_numpy=True) return Index._simple_new(data, name=name) # index-like @@ -640,7 +641,7 @@ def _simple_new(cls: type[_IndexT], values, name: Hashable = None) -> _IndexT: Must be careful not to recurse. """ - assert isinstance(values, np.ndarray), type(values) + assert isinstance(values, (np.ndarray, ExtensionArray)) result = object.__new__(cls) result._data = values @@ -656,6 +657,7 @@ def _with_infer(cls, *args, **kwargs): Constructor that uses the 1.0.x behavior inferring numeric dtypes for ndarray[object] inputs. """ + with warnings.catch_warnings(): warnings.filterwarnings("ignore", ".*the Index constructor", FutureWarning) result = cls(*args, **kwargs) @@ -811,6 +813,14 @@ def _cleanup(self) -> None: def _engine(self) -> libindex.IndexEngine: # For base class (object dtype) we get ObjectEngine + if isinstance(self._values, BaseMaskedArray): + return libindex.NullableEngine(self._values) + elif ( + isinstance(self._values, ExtensionArray) + and self._engine_type is libindex.ObjectEngine + ): + return libindex.ExtensionEngine(self._values) + # to avoid a reference cycle, bind `target_values` to a local variable, so # `self` is not passed into the lambda. 
target_values = self._get_engine_target() @@ -1024,9 +1034,15 @@ def take( # Note: we discard fill_value and use self._na_value, only relevant # in the case where allow_fill is True and fill_value is not None - taken = algos.take( - self._values, indices, allow_fill=allow_fill, fill_value=self._na_value - ) + values = self._values + if isinstance(values, np.ndarray): + taken = algos.take( + values, indices, allow_fill=allow_fill, fill_value=self._na_value + ) + else: + taken = values.take( + indices, allow_fill=allow_fill, fill_value=self._na_value + ) return type(self)._simple_new(taken, name=self.name) @final @@ -5012,6 +5028,9 @@ def equals(self, other: Any) -> bool: # d-level MultiIndex can equal d-tuple Index return other.equals(self) + if is_extension_array_dtype(self.dtype): + return self._values.equals(other._values) + if is_extension_array_dtype(other.dtype): # All EA-backed Index subclasses override equals return other.equals(self) diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py index 165048e2a591a..bfef143ed187b 100644 --- a/pandas/core/indexes/interval.py +++ b/pandas/core/indexes/interval.py @@ -725,7 +725,12 @@ def _get_indexer_pointwise( if isinstance(locs, slice): # Only needed for get_indexer_non_unique locs = np.arange(locs.start, locs.stop, locs.step, dtype="intp") - locs = np.array(locs, ndmin=1) + elif lib.is_integer(locs): + locs = np.array(locs, ndmin=1) + else: + # FIXME: This is wrong; its boolean; not reached + assert locs.dtype.kind == "i" + except KeyError: missing.append(i) locs = np.array([-1]) diff --git a/pandas/tests/arithmetic/test_numeric.py b/pandas/tests/arithmetic/test_numeric.py index 2cdc35bdf51cb..1a55e65aee93c 100644 --- a/pandas/tests/arithmetic/test_numeric.py +++ b/pandas/tests/arithmetic/test_numeric.py @@ -1401,9 +1401,9 @@ def test_integer_array_add_list_like( if Series == box_pandas_1d_array: expected = Series(expected_data, dtype="Int64") elif Series == box_1d_array: - expected = 
Series(expected_data, dtype="object") + expected = Series(expected_data, dtype="Int64") elif Index in (box_pandas_1d_array, box_1d_array): - expected = Int64Index(expected_data) + expected = Index(expected_data, dtype="Int64") else: expected = np.array(expected_data, dtype="object") diff --git a/pandas/tests/arrays/categorical/test_constructors.py b/pandas/tests/arrays/categorical/test_constructors.py index ee24ecb4964ec..05ea29081cc7c 100644 --- a/pandas/tests/arrays/categorical/test_constructors.py +++ b/pandas/tests/arrays/categorical/test_constructors.py @@ -728,7 +728,8 @@ def test_categorical_extension_array_nullable(self, nulls_fixture): # GH: arr = pd.arrays.StringArray._from_sequence([nulls_fixture] * 2) result = Categorical(arr) - expected = Categorical(Series([pd.NA, pd.NA], dtype="object")) + assert arr.dtype == result.categories.dtype + expected = Categorical(Series([pd.NA, pd.NA], dtype=arr.dtype)) tm.assert_categorical_equal(result, expected) def test_from_sequence_copy(self): diff --git a/pandas/tests/arrays/integer/test_dtypes.py b/pandas/tests/arrays/integer/test_dtypes.py index f8b1ea2ebde23..c71ba7209cc44 100644 --- a/pandas/tests/arrays/integer/test_dtypes.py +++ b/pandas/tests/arrays/integer/test_dtypes.py @@ -72,7 +72,8 @@ def test_construct_index(all_data, dropna): other = all_data result = pd.Index(pd.array(other, dtype=all_data.dtype)) - expected = pd.Index(other, dtype=object) + expected = pd.Index(other, dtype=all_data.dtype) + assert all_data.dtype == expected.dtype # dont coerce to object tm.assert_index_equal(result, expected) diff --git a/pandas/tests/base/test_value_counts.py b/pandas/tests/base/test_value_counts.py index 10f391a49d98f..47cd29c49000c 100644 --- a/pandas/tests/base/test_value_counts.py +++ b/pandas/tests/base/test_value_counts.py @@ -35,10 +35,16 @@ def test_value_counts(index_or_series_obj): if isinstance(obj, pd.MultiIndex): expected.index = Index(expected.index) + if not isinstance(result.dtype, np.dtype): + # 
TODO: be more specific + # i.e IntegerDtype + expected = expected.astype(result.dtype) + # TODO: Order of entries with the same count is inconsistent on CI (gh-32449) if obj.duplicated().any(): result = result.sort_index() expected = expected.sort_index() + tm.assert_series_equal(result, expected) @@ -76,6 +82,11 @@ def test_value_counts_null(null_obj, index_or_series_obj): # Order of entries with the same count is inconsistent on CI (gh-32449) expected = expected.sort_index() result = result.sort_index() + + if not isinstance(result.dtype, np.dtype): + # TODO: be more specific + # i.e IntegerDtype + expected = expected.astype(result.dtype) tm.assert_series_equal(result, expected) # can't use expected[null_obj] = 3 as diff --git a/pandas/tests/extension/base/ea_index.py b/pandas/tests/extension/base/ea_index.py new file mode 100644 index 0000000000000..8309842f7134f --- /dev/null +++ b/pandas/tests/extension/base/ea_index.py @@ -0,0 +1,11 @@ +""" +Tests for Indexes backed by arbitrary ExtensionArrays. 
+""" +import pandas as pd +from pandas.tests.extension.base.base import BaseExtensionTests + + +class BaseExtensionIndexTests(BaseExtensionTests): + def test_index_from_array(self, data): + idx = pd.Index(data) + assert data.dtype == idx.dtype diff --git a/pandas/tests/frame/test_reductions.py b/pandas/tests/frame/test_reductions.py index 258e4e6eb0cc9..d30d810703594 100644 --- a/pandas/tests/frame/test_reductions.py +++ b/pandas/tests/frame/test_reductions.py @@ -1067,7 +1067,7 @@ def test_idxmax_idxmin_convert_dtypes(self, op, expected_value): result = getattr(df, op)() expected = DataFrame( {"value": expected_value}, - index=Index([100, 200], name="ID"), + index=Index([100, 200], name="ID", dtype="Int64"), ) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py index 3ae11847cc06b..edc366d2df4f4 100644 --- a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -1126,7 +1126,7 @@ def test_apply_to_nullable_integer_returns_float(values, function): # https://github.com/pandas-dev/pandas/issues/32219 output = 0.5 if function == "var" else 1.5 arr = np.array([output] * 3, dtype=float) - idx = Index([1, 2, 3], name="a") + idx = Index([1, 2, 3], name="a", dtype="Int64") expected = DataFrame({"b": arr}, index=idx).astype("Float64") groups = DataFrame(values, dtype="Int64").groupby("a") @@ -1146,7 +1146,7 @@ def test_groupby_sum_below_mincount_nullable_integer(): # https://github.com/pandas-dev/pandas/issues/32861 df = DataFrame({"a": [0, 1, 2], "b": [0, 1, 2], "c": [0, 1, 2]}, dtype="Int64") grouped = df.groupby("a") - idx = Index([0, 1, 2], name="a") + idx = Index([0, 1, 2], name="a", dtype="Int64") result = grouped["b"].sum(min_count=2) expected = Series([pd.NA] * 3, dtype="Int64", index=idx, name="b") diff --git a/pandas/tests/groupby/test_quantile.py b/pandas/tests/groupby/test_quantile.py index bcb2abeed75e4..1badc4aa7995a 100644 --- 
a/pandas/tests/groupby/test_quantile.py +++ b/pandas/tests/groupby/test_quantile.py @@ -251,14 +251,14 @@ def test_groupby_quantile_NA_float(any_float_dtype): # GH#42849 df = DataFrame({"x": [1, 1], "y": [0.2, np.nan]}, dtype=any_float_dtype) result = df.groupby("x")["y"].quantile(0.5) - expected = pd.Series([0.2], dtype=float, index=[1.0], name="y") - expected.index.name = "x" + exp_index = Index([1.0], dtype=any_float_dtype, name="x") + expected = pd.Series([0.2], dtype=float, index=exp_index, name="y") tm.assert_series_equal(expected, result) result = df.groupby("x")["y"].quantile([0.5, 0.75]) expected = pd.Series( [0.2] * 2, - index=pd.MultiIndex.from_product(([1.0], [0.5, 0.75]), names=["x", None]), + index=pd.MultiIndex.from_product((exp_index, [0.5, 0.75]), names=["x", None]), name="y", ) tm.assert_series_equal(result, expected) @@ -268,11 +268,13 @@ def test_groupby_quantile_NA_int(any_int_ea_dtype): # GH#42849 df = DataFrame({"x": [1, 1], "y": [2, 5]}, dtype=any_int_ea_dtype) result = df.groupby("x")["y"].quantile(0.5) - expected = pd.Series([3.5], dtype=float, index=Index([1], name="x"), name="y") + expected = pd.Series( + [3.5], dtype=float, index=Index([1], name="x", dtype=any_int_ea_dtype), name="y" + ) tm.assert_series_equal(expected, result) result = df.groupby("x").quantile(0.5) - expected = DataFrame({"y": 3.5}, index=Index([1], name="x")) + expected = DataFrame({"y": 3.5}, index=Index([1], name="x", dtype=any_int_ea_dtype)) tm.assert_frame_equal(result, expected) @@ -281,7 +283,9 @@ def test_groupby_quantile_allNA_column(dtype): # GH#42849 df = DataFrame({"x": [1, 1], "y": [pd.NA] * 2}, dtype=dtype) result = df.groupby("x")["y"].quantile(0.5) - expected = pd.Series([np.nan], dtype=float, index=[1.0], name="y") + expected = pd.Series( + [np.nan], dtype=float, index=Index([1.0], dtype=dtype), name="y" + ) expected.index.name = "x" tm.assert_series_equal(expected, result) diff --git a/pandas/tests/indexes/common.py b/pandas/tests/indexes/common.py 
index a8684ca4d3c25..6606bcd2e08f9 100644 --- a/pandas/tests/indexes/common.py +++ b/pandas/tests/indexes/common.py @@ -525,6 +525,7 @@ def test_format_empty(self): assert empty_idx.format() == [] assert empty_idx.format(name=True) == [""] + # TODO: doesn't belong in this class, gets re-run for every subclas def test_hasnans_isnans(self, index_flat): # GH 11343, added tests for hasnans / isnans index = index_flat @@ -547,7 +548,7 @@ def test_hasnans_isnans(self, index_flat): else: values[1] = np.nan - if isinstance(index, PeriodIndex): + if False: # isinstance(index, PeriodIndex): idx = type(index)(values, freq=index.freq) else: idx = type(index)(values) diff --git a/pandas/tests/indexes/test_numpy_compat.py b/pandas/tests/indexes/test_numpy_compat.py index 9cc1205310ea7..c4a0936d8d633 100644 --- a/pandas/tests/indexes/test_numpy_compat.py +++ b/pandas/tests/indexes/test_numpy_compat.py @@ -10,6 +10,7 @@ ) import pandas._testing as tm from pandas.core.api import Float64Index +from pandas.core.arrays import BooleanArray from pandas.core.indexes.datetimelike import DatetimeIndexOpsMixin @@ -49,14 +50,21 @@ def test_numpy_ufuncs_basic(index, func): with tm.external_error_raised((TypeError, AttributeError)): with np.errstate(all="ignore"): func(index) - elif isinstance(index, NumericIndex): + elif isinstance(index, NumericIndex) or ( + not isinstance(index.dtype, np.dtype) and index.dtype._is_numeric + ): # coerces to float (e.g. np.sin) with np.errstate(all="ignore"): result = func(index) exp = Index(func(index.values), name=index.name) tm.assert_index_equal(result, exp) - assert isinstance(result, Float64Index) + if type(index) is not Index: + # i.e NumericIndex + assert isinstance(result, Float64Index) + else: + # e.g. 
np.exp with Int64 -> Float64 + assert type(result) is Index else: # raise AttributeError or TypeError if len(index) == 0: @@ -94,10 +102,16 @@ def test_numpy_ufuncs_other(index, func, request): with tm.external_error_raised(TypeError): func(index) - elif isinstance(index, NumericIndex): + elif isinstance(index, NumericIndex) or ( + not isinstance(index.dtype, np.dtype) and index.dtype._is_numeric + ): # Results in bool array result = func(index) - assert isinstance(result, np.ndarray) + if not isinstance(index.dtype, np.dtype): + # e.g. Int64 we expect to get BooleanArray back + assert isinstance(result, BooleanArray) + else: + assert isinstance(result, np.ndarray) assert not isinstance(result, Index) else: if len(index) == 0: diff --git a/pandas/tests/indexing/test_indexing.py b/pandas/tests/indexing/test_indexing.py index 01407f1f9bae7..d5c1d6f1533d3 100644 --- a/pandas/tests/indexing/test_indexing.py +++ b/pandas/tests/indexing/test_indexing.py @@ -99,6 +99,9 @@ def test_getitem_ndarray_3d( msgs.append("Data must be 1-dimensional") if len(index) == 0 or isinstance(index, pd.MultiIndex): msgs.append("positional indexers are out-of-bounds") + if type(index) is Index and not isinstance(index._values, np.ndarray): + # e.g. 
Int64 + msgs.append("values must be a 1D array") msg = "|".join(msgs) potential_errors = (IndexError, ValueError, NotImplementedError) diff --git a/pandas/tests/series/methods/test_astype.py b/pandas/tests/series/methods/test_astype.py index 732d375d136d0..a9833e746ecb2 100644 --- a/pandas/tests/series/methods/test_astype.py +++ b/pandas/tests/series/methods/test_astype.py @@ -427,9 +427,13 @@ def test_astype_string_to_extension_dtype_roundtrip( ) request.node.add_marker(mark) # GH-40351 - s = Series(data, dtype=dtype) - result = s.astype(nullable_string_dtype).astype(dtype) - tm.assert_series_equal(result, s) + ser = Series(data, dtype=dtype) + + # Note: just passing .astype(dtype) fails for dtype="category" + # with bc ser.dtype.categories will be object dtype whereas + # result.dtype.categories will have string dtype + result = ser.astype(nullable_string_dtype).astype(ser.dtype) + tm.assert_series_equal(result, ser) class TestAstypeCategorical: diff --git a/pandas/tests/strings/test_extract.py b/pandas/tests/strings/test_extract.py index 16ec4a8c6831c..9d5380955acae 100644 --- a/pandas/tests/strings/test_extract.py +++ b/pandas/tests/strings/test_extract.py @@ -258,6 +258,7 @@ def test_extract_expand_True_single_capture_group(index_or_series, any_string_dt s_or_idx = index_or_series(["A1", "A2"], dtype=any_string_dtype) result = s_or_idx.str.extract(r"(?PA)\d", expand=True) expected_dtype = "object" if index_or_series is Index else any_string_dtype + expected_dtype = any_string_dtype expected = DataFrame({"uno": ["A", "A"]}, dtype=expected_dtype) tm.assert_frame_equal(result, expected) From 95e012971a604f1aa759334bc71a43c5a8d75a57 Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 8 Oct 2021 12:30:30 -0700 Subject: [PATCH 02/57] BUG: NumericIndex.insert --- pandas/core/indexes/base.py | 7 ++++--- pandas/tests/indexes/common.py | 19 +++++++++++++++++++ 2 files changed, 23 insertions(+), 3 deletions(-) diff --git a/pandas/core/indexes/base.py 
b/pandas/core/indexes/base.py index da953fe46ef1d..2ff9b3973a526 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -6329,10 +6329,11 @@ def insert(self, loc: int, item) -> Index: arr = np.asarray(self) - # Use Index constructor to ensure we get tuples cast correctly. - item = Index([item], dtype=self.dtype)._values + # Use constructor to ensure we get tuples cast correctly. + # Use self._constructor instead of Index to retain NumericIndex GH#43921 + item = self._constructor([item], dtype=self.dtype)._values idx = np.concatenate((arr[:loc], item, arr[loc:])) - return Index._with_infer(idx, name=self.name) + return self._constructor._with_infer(idx, name=self.name) def drop(self, labels, errors: str_t = "raise") -> Index: """ diff --git a/pandas/tests/indexes/common.py b/pandas/tests/indexes/common.py index 8357595fdaa40..7e43664c6b3de 100644 --- a/pandas/tests/indexes/common.py +++ b/pandas/tests/indexes/common.py @@ -793,6 +793,20 @@ def test_format(self, simple_index): def test_numeric_compat(self): pass # override Base method + def test_insert_non_na(self, simple_index): + # GH#43921 inserting an element that we know we can hold should + # not change dtype or type (except for RangeIndex) + index = simple_index + + result = index.insert(0, index[0]) + + cls = type(index) + if cls is RangeIndex: + cls = Int64Index + + expected = cls([index[0]] + list(index), dtype=index.dtype) + tm.assert_index_equal(result, expected) + def test_insert_na(self, nulls_fixture, simple_index): # GH 18295 (test missing) index = simple_index @@ -800,6 +814,11 @@ def test_insert_na(self, nulls_fixture, simple_index): if na_val is pd.NaT: expected = Index([index[0], pd.NaT] + list(index[1:]), dtype=object) + elif type(index) is NumericIndex and index.dtype.kind == "f": + # GH#43921 + expected = NumericIndex( + [index[0], np.nan] + list(index[1:]), dtype=index.dtype + ) else: expected = Float64Index([index[0], np.nan] + list(index[1:])) From 
d53377d841d62860e6c216477304248ae8dbaba0 Mon Sep 17 00:00:00 2001 From: Brock Date: Sun, 10 Oct 2021 13:21:43 -0700 Subject: [PATCH 03/57] fix a few more tests; ignoring linting for now --- pandas/core/indexes/base.py | 20 ++++++++++++++++++-- pandas/core/indexing.py | 2 +- pandas/tests/base/test_value_counts.py | 5 +++-- pandas/tests/strings/test_extract.py | 4 +--- 4 files changed, 23 insertions(+), 8 deletions(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 8456823cbacc6..19b44784c7f99 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -6316,7 +6316,12 @@ def delete(self: _IndexT, loc) -> _IndexT: >>> idx.delete([0, 2]) Index(['b'], dtype='object') """ - res_values = np.delete(self._data, loc) + values = self._values + if isinstance(values, np.ndarray): + res_values = np.delete(values, loc) + else: + # TODO(__array_function__) special-casing unnecessary + res_values = values.delete(loc) return type(self)._simple_new(res_values, name=self.name) def insert(self, loc: int, item) -> Index: @@ -6346,7 +6351,18 @@ def insert(self, loc: int, item) -> Index: dtype = self._find_common_type_compat(item) return self.astype(dtype).insert(loc, item) - arr = np.asarray(self) + arr = self._values + if isinstance(arr, ExtensionArray): + # TODO: need EA.insert + try: + arr2 = type(arr)._from_sequence([item], dtype=arr.dtype) + except TypeError: + # TODO: make this into _validate_fill_value + dtype = self._find_common_type_compat(item) + return self.astype(dtype).insert(loc, item) + + res_values = arr._concat_same_type([arr[:loc], arr2, arr[loc:]]) + return type(self)._simple_new(res_values, name=self.name) # Use constructor to ensure we get tuples cast correctly. 
# Use self._constructor instead of Index to retain NumericIndex GH#43921 diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index bbb3cb3391dfa..31c1619fba4d8 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -964,7 +964,7 @@ def _validate_key(self, key, axis: int): # slice of labels (where start-end in labels) # slice of integers (only if in the labels) # boolean not in slice and with boolean index - if isinstance(key, bool) and not is_bool_dtype(self.obj.index): + if isinstance(key, bool) and not (is_bool_dtype(self.obj.index) or self.obj.index.dtype.name == "boolean"): raise KeyError( f"{key}: boolean label can not be used without a boolean index" ) diff --git a/pandas/tests/base/test_value_counts.py b/pandas/tests/base/test_value_counts.py index 47cd29c49000c..f88fe6fa34050 100644 --- a/pandas/tests/base/test_value_counts.py +++ b/pandas/tests/base/test_value_counts.py @@ -91,8 +91,9 @@ def test_value_counts_null(null_obj, index_or_series_obj): # can't use expected[null_obj] = 3 as # IntervalIndex doesn't allow assignment - new_entry = Series({np.nan: 3}, dtype=np.int64) - expected = expected.append(new_entry) + #new_entry = Series({np.nan: 3}, dtype=np.int64) + #expected = expected.append(new_entry) # TODO: test that both of these work with IntegerNAIndex + expected[null_obj] = 3 result = obj.value_counts(dropna=False) if obj.duplicated().any(): diff --git a/pandas/tests/strings/test_extract.py b/pandas/tests/strings/test_extract.py index 9d5380955acae..0f4ffccd8ad7f 100644 --- a/pandas/tests/strings/test_extract.py +++ b/pandas/tests/strings/test_extract.py @@ -257,9 +257,7 @@ def test_extract_expand_True_single_capture_group(index_or_series, any_string_dt # single group renames series/index properly s_or_idx = index_or_series(["A1", "A2"], dtype=any_string_dtype) result = s_or_idx.str.extract(r"(?PA)\d", expand=True) - expected_dtype = "object" if index_or_series is Index else any_string_dtype - expected_dtype = 
any_string_dtype - expected = DataFrame({"uno": ["A", "A"]}, dtype=expected_dtype) + expected = DataFrame({"uno": ["A", "A"]}, dtype=any_string_dtype) tm.assert_frame_equal(result, expected) From 1ed588a7acb7bbb9f23186a5f6fbce737c2eb941 Mon Sep 17 00:00:00 2001 From: Brock Date: Mon, 11 Oct 2021 16:13:19 -0700 Subject: [PATCH 04/57] fix test --- pandas/tests/arithmetic/test_numeric.py | 23 ++++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/pandas/tests/arithmetic/test_numeric.py b/pandas/tests/arithmetic/test_numeric.py index 8f97ed8774264..9cd56b66ca47e 100644 --- a/pandas/tests/arithmetic/test_numeric.py +++ b/pandas/tests/arithmetic/test_numeric.py @@ -1402,14 +1402,23 @@ def test_integer_array_add_list_like( left = container + box_1d_array(data) right = box_1d_array(data) + container - if Series == box_pandas_1d_array: - expected = Series(expected_data, dtype="Int64") - elif Series == box_1d_array: - expected = Series(expected_data, dtype="Int64") - elif Index in (box_pandas_1d_array, box_1d_array): - expected = Index(expected_data, dtype="Int64") + if Series in [box_1d_array, box_pandas_1d_array]: + cls = Series + elif Index in [box_1d_array, box_pandas_1d_array]: + cls = Index else: - expected = np.array(expected_data, dtype="object") + cls = np.array + + if box_pandas_1d_array in [Index, Series]: + expected = cls(expected_data, dtype="Int64") + + elif box_1d_array == Index: + # tm.to_array casts to object, Index constructor does inference + expected = cls(expected_data, dtype="int64") + + else: + # tm.to_array casts to object, no inference + expected = cls(expected_data, dtype="object") tm.assert_equal(left, expected) tm.assert_equal(right, expected) From 34d5dde0895e12ad723bd9b962cb9960941de4dc Mon Sep 17 00:00:00 2001 From: Brock Date: Mon, 11 Oct 2021 16:56:57 -0700 Subject: [PATCH 05/57] down to 38 tests failing --- pandas/_libs/index.pyx | 38 +++++++++++++++++-- pandas/_libs/lib.pxd | 4 ++ pandas/_libs/lib.pyx | 20 
++++++++++ pandas/_testing/asserters.py | 10 ++++- pandas/conftest.py | 10 ++--- pandas/core/arrays/floating.py | 5 ++- pandas/core/arrays/masked.py | 3 ++ pandas/core/indexes/base.py | 4 ++ pandas/tests/arrays/floating/test_function.py | 20 ++++++---- pandas/tests/indexes/common.py | 9 ++++- 10 files changed, 103 insertions(+), 20 deletions(-) diff --git a/pandas/_libs/index.pyx b/pandas/_libs/index.pyx index fc6de84631d72..4fa371f116fc1 100644 --- a/pandas/_libs/index.pyx +++ b/pandas/_libs/index.pyx @@ -32,6 +32,7 @@ from pandas._libs import ( algos, hashtable as _hash, ) +from pandas._libs.lib cimport eq_NA_compat from pandas._libs.missing cimport ( C_NA as NA, @@ -63,7 +64,7 @@ cdef ndarray _get_bool_indexer(ndarray values, object val): if values.descr.type_num == cnp.NPY_OBJECT: # i.e. values.dtype == object if not checknull(val): - indexer = values == val + indexer = eq_NA_compat(values, val) else: # We need to check for _matching_ NA values @@ -910,7 +911,7 @@ cdef class ExtensionEngine: cdef _get_bool_indexer(self, val): if checknull(val): - return self.values.isna() # FIXME: need to check for *matching* NA + return self.values.isna().view("uint8") return self.values == val @@ -946,6 +947,8 @@ cdef class ExtensionEngine: # Because we are unique, loc should always be an integer except KeyError: loc = -1 + else: + assert util.is_integer_object(loc), (loc, val) res[i] = loc return res @@ -1092,6 +1095,13 @@ cdef class NullableEngine: if is_definitely_invalid_key(val): raise TypeError(f"'{val}' is an invalid key") + if val is NA: + # TODO: return copy? readonly view? + # TODO: do this later on to keep same pattern as IndexEngine? 
+ if not self.has_missing: + raise KeyError(val) + return _unpack_bool_indexer(self._mask, val) + self._check_type(val) if self.over_size_threshold and self.is_monotonic_increasing: @@ -1149,7 +1159,10 @@ cdef class NullableEngine: cdef _get_bool_indexer(self, val): if val is NA: - return self._mask + #if not self.has_missing: + # raise KeyError(val) + # TODO: readonly? copy? + return self._mask.view("uint8") if util.is_nan(val): res = np.isnan(self._values) @@ -1175,7 +1188,21 @@ cdef class NullableEngine: return self.sizeof() cdef _check_type(self, object val): - hash(val) + kind = self._values.dtype.kind + if kind in ["i", "u"]: + if not util.is_integer_object(val): + raise KeyError(val) + if kind == "u": + if val < 0: + # cannot have negative values with unsigned int dtype + raise KeyError(val) + elif kind == "b": + if not util.is_bool_object(val): + raise KeyError(val) + else: + if not util.is_integer_object(val) and not util.is_float_object(val): + # in particular catch bool and avoid casting True -> 1.0 + raise KeyError(val) def get_indexer(self, values: "MaskedArray") -> np.ndarray: # Note: we only get here with self.is_unique @@ -1191,6 +1218,9 @@ cdef class NullableEngine: # Because we are unique, loc should always be an integer except KeyError: loc = -1 + else: + assert util.is_integer_object(loc), (loc, val) + res[i] = loc return res diff --git a/pandas/_libs/lib.pxd b/pandas/_libs/lib.pxd index b3c72c30a74de..1306960b403e2 100644 --- a/pandas/_libs/lib.pxd +++ b/pandas/_libs/lib.pxd @@ -1 +1,5 @@ +from numpy cimport ndarray + cdef bint c_is_list_like(object, bint) except -1 + +cpdef ndarray eq_NA_compat(ndarray[object] arr, object key) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index e7f889ef39707..e1b821cd8c27f 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -3028,3 +3028,23 @@ def is_bool_list(obj: list) -> bool: # Note: we return True for empty list return True + + +cpdef ndarray eq_NA_compat(ndarray[object] arr, object 
key): + cdef: + ndarray[uint8_t, cast=True] result = np.empty(len(arr), dtype=bool) + Py_ssize_t i + object item + + if key is C_NA: + for i in range(len(arr)): + item = arr[i] + result[i] = item is C_NA + else: + for i in range(len(arr)): + item = arr[i] + if item is C_NA: + result[i] = False + else: + result[i] = item == key # FIXME: compat for other NAs + return result diff --git a/pandas/_testing/asserters.py b/pandas/_testing/asserters.py index fc7e36dda4619..f95de4a206682 100644 --- a/pandas/_testing/asserters.py +++ b/pandas/_testing/asserters.py @@ -400,8 +400,16 @@ def _get_ilevel_values(index, level): # skip exact index checking when `check_categorical` is False if check_exact and check_categorical: if not left.equals(right): + mismatch = left._values != right._values + + if not isinstance(mismatch, np.ndarray): + # i.e. its a MaskedArray + mismatch = mismatch.to_numpy(dtype=int, na_value=0) + mismask = left._values._mask ^ right._values._mask + mismatch[mismask] = 1 + diff = ( - np.sum((left._values != right._values).astype(int)) * 100.0 / len(left) + np.sum(mismatch.astype(int)) * 100.0 / len(left) ) msg = f"{obj} values are different ({np.round(diff, 5)} %)" raise_assert_detail(obj, msg, left, right) diff --git a/pandas/conftest.py b/pandas/conftest.py index 3428c6a65da79..ab53628439011 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -501,11 +501,11 @@ def _create_mi_with_dt64tz_level(): "nullable_int": Index(np.arange(100), dtype="Int64"), "nullable_float": Index(np.arange(100), dtype="Float32"), "nullable_bool": Index(np.arange(100).astype(bool), dtype="boolean"), - "nullable_int-na": Index(np.arange(100), dtype="Int64").insert(1, pd.NA), - "nullable_float-na": Index(np.arange(100), dtype="Float32").insert(1, pd.NA), - "nullable_bool-na": Index(np.arange(100).astype(bool), dtype="boolean").insert( - 1, pd.NA - ), + #"nullable_int-na": Index(np.arange(100), dtype="Int64").insert(1, pd.NA), + #"nullable_float-na": Index(np.arange(100), 
dtype="Float32").insert(1, pd.NA), + #"nullable_bool-na": Index(np.arange(100).astype(bool), dtype="boolean").insert( + # 1, pd.NA + #), } diff --git a/pandas/core/arrays/floating.py b/pandas/core/arrays/floating.py index 066f6ebdfcaa6..a5cb8a419e4f5 100644 --- a/pandas/core/arrays/floating.py +++ b/pandas/core/arrays/floating.py @@ -108,6 +108,8 @@ def coerce_to_array( if dtype is None and hasattr(values, "dtype"): if is_float_dtype(values.dtype): dtype = values.dtype + if dtype == "float16": + raise TypeError("FloatingArray does not support float16 dtype") if dtype is not None: if isinstance(dtype, str) and dtype.startswith("Float"): @@ -254,7 +256,8 @@ def dtype(self) -> FloatingDtype: return FLOAT_STR_TO_DTYPE[str(self._data.dtype)] def __init__(self, values: np.ndarray, mask: np.ndarray, copy: bool = False): - if not (isinstance(values, np.ndarray) and values.dtype.kind == "f"): + if not (isinstance(values, np.ndarray) and values.dtype.kind == "f" and values.dtype.itemsize > 2): + # We do not support float16 raise TypeError( "values should be floating numpy array. 
Use " "the 'pd.array' function instead" diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index 3737719460431..dbd022eac2f00 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -412,6 +412,9 @@ def reconstruct(x): m = mask.copy() return IntegerArray(x, m) elif is_float_dtype(x.dtype): + if x.dtype.itemsize <= 2: + # we don't support float16 + x = x.astype(np.float32) m = mask.copy() return FloatingArray(x, m) else: diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 8586c9925fd7e..dd8c323cc4cfb 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -26,6 +26,7 @@ algos as libalgos, index as libindex, lib, + missing as libmissing, ) import pandas._libs.join as libjoin from pandas._libs.lib import ( @@ -821,6 +822,7 @@ def _engine(self) -> libindex.IndexEngine: ): return libindex.ExtensionEngine(self._values) + assert self.dtype != "boolean" # to avoid a reference cycle, bind `target_values` to a local variable, so # `self` is not passed into the lambda. 
target_values = self._get_engine_target() @@ -3587,6 +3589,7 @@ def get_indexer( indexer = self._engine.get_indexer(target.codes) if self.hasnans and target.hasnans: + #loc = self.get_loc(libmissing.NA) loc = self.get_loc(np.nan) mask = target.isna() indexer[mask] = loc @@ -3605,6 +3608,7 @@ def get_indexer( # Exclude MultiIndex because hasnans raises NotImplementedError # we should only get here if we are unique, so loc is an integer # GH#41934 + #loc = self.get_loc(libmissing.NA) loc = self.get_loc(np.nan) mask = target.isna() indexer[mask] = loc diff --git a/pandas/tests/arrays/floating/test_function.py b/pandas/tests/arrays/floating/test_function.py index ef95eac316397..f96cbb8366564 100644 --- a/pandas/tests/arrays/floating/test_function.py +++ b/pandas/tests/arrays/floating/test_function.py @@ -97,26 +97,30 @@ def test_stat_method(pandasmethname, kwargs): def test_value_counts_na(): arr = pd.array([0.1, 0.2, 0.1, pd.NA], dtype="Float64") result = arr.value_counts(dropna=False) - expected = pd.Series([2, 1, 1], index=[0.1, 0.2, pd.NA], dtype="Int64") + idx = pd.Index([0.1, 0.2, pd.NA], dtype=arr.dtype) + assert idx.dtype == arr.dtype + expected = pd.Series([2, 1, 1], index=idx, dtype="Int64") tm.assert_series_equal(result, expected) result = arr.value_counts(dropna=True) - expected = pd.Series([2, 1], index=[0.1, 0.2], dtype="Int64") + expected = pd.Series([2, 1], index=idx[:-1], dtype="Int64") tm.assert_series_equal(result, expected) def test_value_counts_empty(): - s = pd.Series([], dtype="Float64") - result = s.value_counts() - idx = pd.Index([], dtype="object") + ser = pd.Series([], dtype="Float64") + result = ser.value_counts() + idx = pd.Index([], dtype="Float64") + assert idx.dtype == "Float64" expected = pd.Series([], index=idx, dtype="Int64") tm.assert_series_equal(result, expected) def test_value_counts_with_normalize(): - s = pd.Series([0.1, 0.2, 0.1, pd.NA], dtype="Float64") - result = s.value_counts(normalize=True) - expected = pd.Series([2, 1], 
index=[0.1, 0.2], dtype="Float64") / 3 + ser = pd.Series([0.1, 0.2, 0.1, pd.NA], dtype="Float64") + result = ser.value_counts(normalize=True) + expected = pd.Series([2, 1], index=ser[:2], dtype="Float64") / 3 + assert expected.index.dtype == ser.dtype tm.assert_series_equal(result, expected) diff --git a/pandas/tests/indexes/common.py b/pandas/tests/indexes/common.py index 38127a0e255bd..5932c0ab878b8 100644 --- a/pandas/tests/indexes/common.py +++ b/pandas/tests/indexes/common.py @@ -296,6 +296,12 @@ def test_ensure_copied_data(self, index): elif isinstance(index, IntervalIndex): # checked in test_interval.py pass + elif type(index) is Index and not isinstance(index.dtype, np.dtype): + result = index_type(index.values, copy=False, **init_kwargs) + # FIXME: this is specific to MaskedArray + tm.assert_numpy_array_equal(index._values._data, result._values._data, check_same="same") + tm.assert_numpy_array_equal(index._values._mask, result._values._mask, check_same="same") + else: result = index_type(index.values, copy=False, **init_kwargs) tm.assert_numpy_array_equal(index.values, result.values, check_same="same") @@ -315,7 +321,8 @@ def test_memory_usage(self, index): # RangeIndex, IntervalIndex # don't have engines - if not isinstance(index, (RangeIndex, IntervalIndex)): + # Index[EA] has engine but it does not have a Hashtable .mapping + if not isinstance(index, (RangeIndex, IntervalIndex)) and not (type(index) is Index and not isinstance(index.dtype, np.dtype)): assert result2 > result if index.inferred_type == "object": From 544d9fea6ef8d30383e671880459681c4708fc57 Mon Sep 17 00:00:00 2001 From: Brock Date: Tue, 12 Oct 2021 19:14:45 -0700 Subject: [PATCH 06/57] down to 15 tests failing --- pandas/_libs/testing.pyx | 16 ++++++++++---- pandas/core/arrays/floating.py | 2 ++ pandas/core/indexes/base.py | 2 +- pandas/tests/arrays/boolean/test_function.py | 13 +++++++----- pandas/tests/arrays/integer/test_function.py | 22 ++++++++++++-------- 
pandas/tests/indexes/common.py | 4 +++- pandas/tests/indexes/test_common.py | 4 +++- 7 files changed, 42 insertions(+), 21 deletions(-) diff --git a/pandas/_libs/testing.pyx b/pandas/_libs/testing.pyx index cfe9f40f12452..90c10be990ac2 100644 --- a/pandas/_libs/testing.pyx +++ b/pandas/_libs/testing.pyx @@ -13,6 +13,7 @@ from pandas._libs.util cimport ( is_real_number_object, ) +from pandas._libs.missing cimport is_matching_na from pandas.core.dtypes.common import is_dtype_equal from pandas.core.dtypes.missing import ( array_equivalent, @@ -174,11 +175,18 @@ cpdef assert_almost_equal(a, b, # classes can't be the same, to raise error assert_class_equal(a, b, obj=obj) - if isna(a) and isna(b): - # TODO: Should require same-dtype NA? - # nan / None comparison - return True + if isna(a): + if isna(b): + # TODO: Should require same-dtype NA? + # nan / None comparison + return True + + assert False, f"expected {a} but got {b}" + + elif isna(b): + assert False, f"expected {a} but got {b}" + # TODO: test for tm.assert_whatever with pd.NA that would raise here if a == b: # object comparison return True diff --git a/pandas/core/arrays/floating.py b/pandas/core/arrays/floating.py index a5cb8a419e4f5..1e2864121c92e 100644 --- a/pandas/core/arrays/floating.py +++ b/pandas/core/arrays/floating.py @@ -425,6 +425,8 @@ def _maybe_mask_result(self, result, mask, other, op_name: str): return type(self)(result, mask, copy=False) + def isna(self): + return self._mask | np.isnan(self._data) _dtype_docstring = """ An ExtensionDtype for {dtype} data. 
diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 832e633da07d9..57596024a8afd 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -358,7 +358,7 @@ def _outer_indexer( _typ: str = "index" _data: ExtensionArray | np.ndarray - _data_cls: type[np.ndarray] | type[ExtensionArray] = np.ndarray + _data_cls: type[np.ndarray] | type[ExtensionArray] = (np.ndarray, ExtensionArray) _id: object | None = None _name: Hashable = None # MultiIndex.levels previously allowed setting the index name. We diff --git a/pandas/tests/arrays/boolean/test_function.py b/pandas/tests/arrays/boolean/test_function.py index d90655b6e2820..6871fbb1e29d2 100644 --- a/pandas/tests/arrays/boolean/test_function.py +++ b/pandas/tests/arrays/boolean/test_function.py @@ -77,18 +77,21 @@ def test_ufunc_reduce_raises(values): def test_value_counts_na(): arr = pd.array([True, False, pd.NA], dtype="boolean") result = arr.value_counts(dropna=False) - expected = pd.Series([1, 1, 1], index=[True, False, pd.NA], dtype="Int64") + expected = pd.Series([1, 1, 1], index=arr, dtype="Int64") + assert expected.index.dtype == arr.dtype tm.assert_series_equal(result, expected) result = arr.value_counts(dropna=True) - expected = pd.Series([1, 1], index=[True, False], dtype="Int64") + expected = pd.Series([1, 1], index=arr[:-1], dtype="Int64") + assert expected.index.dtype == arr.dtype tm.assert_series_equal(result, expected) def test_value_counts_with_normalize(): - s = pd.Series([True, False, pd.NA], dtype="boolean") - result = s.value_counts(normalize=True) - expected = pd.Series([1, 1], index=[True, False], dtype="Float64") / 2 + ser = pd.Series([True, False, pd.NA], dtype="boolean") + result = ser.value_counts(normalize=True) + expected = pd.Series([1, 1], index=ser[:-1], dtype="Float64") / 2 + assert expected.index.dtype == "boolean" tm.assert_series_equal(result, expected) diff --git a/pandas/tests/arrays/integer/test_function.py 
b/pandas/tests/arrays/integer/test_function.py index 6f53b44776900..306bac96da3d0 100644 --- a/pandas/tests/arrays/integer/test_function.py +++ b/pandas/tests/arrays/integer/test_function.py @@ -108,29 +108,33 @@ def test_stat_method(pandasmethname, kwargs): def test_value_counts_na(): arr = pd.array([1, 2, 1, pd.NA], dtype="Int64") result = arr.value_counts(dropna=False) - expected = pd.Series([2, 1, 1], index=[1, 2, pd.NA], dtype="Int64") + ex_index = pd.Index([1, 2, pd.NA], dtype="Int64") + assert ex_index.dtype == "Int64" + expected = pd.Series([2, 1, 1], index=ex_index, dtype="Int64") tm.assert_series_equal(result, expected) result = arr.value_counts(dropna=True) - expected = pd.Series([2, 1], index=[1, 2], dtype="Int64") + expected = pd.Series([2, 1], index=arr[:2], dtype="Int64") + assert expected.index.dtype == arr.dtype tm.assert_series_equal(result, expected) def test_value_counts_empty(): # https://github.com/pandas-dev/pandas/issues/33317 - s = pd.Series([], dtype="Int64") - result = s.value_counts() - # TODO: The dtype of the index seems wrong (it's int64 for non-empty) - idx = pd.Index([], dtype="object") + ser = pd.Series([], dtype="Int64") + result = ser.value_counts() + idx = pd.Index([], dtype=ser.dtype) + assert idx.dtype == ser.dtype expected = pd.Series([], index=idx, dtype="Int64") tm.assert_series_equal(result, expected) def test_value_counts_with_normalize(): # GH 33172 - s = pd.Series([1, 2, 1, pd.NA], dtype="Int64") - result = s.value_counts(normalize=True) - expected = pd.Series([2, 1], index=[1, 2], dtype="Float64") / 3 + ser = pd.Series([1, 2, 1, pd.NA], dtype="Int64") + result = ser.value_counts(normalize=True) + expected = pd.Series([2, 1], index=ser[:2], dtype="Float64") / 3 + assert expected.index.dtype == ser.dtype tm.assert_series_equal(result, expected) diff --git a/pandas/tests/indexes/common.py b/pandas/tests/indexes/common.py index 6de8bda1fde61..cd3bf547491f0 100644 --- a/pandas/tests/indexes/common.py +++ 
b/pandas/tests/indexes/common.py @@ -443,7 +443,9 @@ def test_equals(self, index): assert index.equals(index) assert index.equals(index.copy()) - assert index.equals(index.astype(object)) + if not (type(index) is Index and not isinstance(index.dtype, np.dtype)): + # doesn't hold for e.g. IntegerDtype + assert index.equals(index.astype(object)) assert not index.equals(list(index)) assert not index.equals(np.array(index)) diff --git a/pandas/tests/indexes/test_common.py b/pandas/tests/indexes/test_common.py index 604b68cfcc791..4e899112dd048 100644 --- a/pandas/tests/indexes/test_common.py +++ b/pandas/tests/indexes/test_common.py @@ -422,7 +422,9 @@ def test_sort_values_with_missing(index_with_missing, na_position): sorted_values = np.concatenate([[None] * missing_count, sorted_values]) else: sorted_values = np.concatenate([sorted_values, [None] * missing_count]) - expected = type(index_with_missing)(sorted_values) + + # Explicitly pass dtype needed for Index backed by EA e.g. IntegerArray + expected = type(index_with_missing)(sorted_values, dtype=index_with_missing.dtype) result = index_with_missing.sort_values(na_position=na_position) tm.assert_index_equal(result, expected) From 900978c4e50a69b457205240fcb730a6222d3778 Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 15 Oct 2021 21:27:07 -0700 Subject: [PATCH 07/57] fix value_counts --- pandas/conftest.py | 1 + pandas/core/algorithms.py | 7 +++++-- pandas/core/arrays/masked.py | 2 +- pandas/core/dtypes/concat.py | 1 + pandas/core/indexes/base.py | 15 ++++++++++++++- 5 files changed, 22 insertions(+), 4 deletions(-) diff --git a/pandas/conftest.py b/pandas/conftest.py index ab53628439011..9d26c2749b77e 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -499,6 +499,7 @@ def _create_mi_with_dt64tz_level(): "multi": _create_multiindex(), "repeats": Index([0, 0, 1, 1, 2, 2]), "nullable_int": Index(np.arange(100), dtype="Int64"), + "nullable_uint": Index(np.arange(100), dtype="UInt16"), "nullable_float": 
Index(np.arange(100), dtype="Float32"), "nullable_bool": Index(np.arange(100).astype(bool), dtype="boolean"), #"nullable_int-na": Index(np.arange(100), dtype="Int64").insert(1, pd.NA), diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 10a5932731e3b..9420745f2a284 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -1830,12 +1830,15 @@ def union_with_duplicates(lvals: ArrayLike, rvals: ArrayLike) -> ArrayLike: np.ndarray or ExtensionArray Containing the unsorted union of both arrays. """ + from pandas.core.dtypes.concat import concat_compat + indexer = [] l_count = value_counts(lvals, dropna=False) r_count = value_counts(rvals, dropna=False) l_count, r_count = l_count.align(r_count, fill_value=0) - unique_array = unique(np.append(lvals, rvals)) - if not isinstance(lvals, np.ndarray): + unique_array = unique(concat_compat([lvals, rvals])) + unique_array = ensure_wrapped_if_datetimelike(unique_array) + if False:#not isinstance(lvals, np.ndarray): # i.e. ExtensionArray # Note: we only get here with lvals.dtype == rvals.dtype # TODO: are there any cases where union won't be type/dtype preserving? 
diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index dbd022eac2f00..6cef5c5259dac 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -577,7 +577,7 @@ def value_counts(self, dropna: bool = True) -> Series: counts[:-1] = value_counts counts[-1] = self._mask.sum() - index = index.insert(-1, self.dtype.na_value) + index = index.insert(len(index), self.dtype.na_value) # index = Index( # np.concatenate([index, np.array([self.dtype.na_value], dtype=object)]), # dtype=object, diff --git a/pandas/core/dtypes/concat.py b/pandas/core/dtypes/concat.py index c7fce9fff3631..23b76983c6789 100644 --- a/pandas/core/dtypes/concat.py +++ b/pandas/core/dtypes/concat.py @@ -22,6 +22,7 @@ ABCSeries, ) +# TODO: avoid these imports so we can import from this file in core.algorithms from pandas.core.arrays import ExtensionArray from pandas.core.arrays.sparse import SparseArray from pandas.core.construction import ( diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 57596024a8afd..45f399f4631f4 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -5889,7 +5889,20 @@ def map(self, mapper, na_action=None): new_values, dtype=dtype, copy=False, name=self.name ) - return Index._with_infer(new_values, dtype=dtype, copy=False, name=self.name) + result = Index._with_infer(new_values, dtype=dtype, copy=False, name=self.name) + + if type(self) is Index and not isinstance(self.dtype, np.dtype): + # TODO: what about "integer-na" + if self.dtype.kind in ["i", "u"] and result.inferred_type == "integer": + # TODO: worry about itemsize/overflows? + result = result.astype(self.dtype, copy=False) + elif self.dtype.kind == "f" and result.inferred_type == "floating": + # TODO: worry about itemsize/overflows? 
+ result = result.astype(self.dtype, copy=False) + elif self.dtype == "boolean" and result.inferred_type == "boolean": + result = result.astype(self.dtype, copy=False) + + return result # TODO: De-duplicate with map, xref GH#32349 @final From c0ae18c808ee8c75b25da00da036089e82a55a51 Mon Sep 17 00:00:00 2001 From: Brock Date: Mon, 18 Oct 2021 08:34:08 -0700 Subject: [PATCH 08/57] fix map test --- pandas/tests/indexes/test_base.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index cbcb00a4230cc..9abf1abbc4365 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -723,6 +723,9 @@ def test_map_dictlike(self, index, mapper): else: exp_dtype = np.int64 expected = index._constructor(np.arange(len(index), 0, -1), dtype=exp_dtype) + elif type(index) is Index and index.dtype != object: + # i.e. EA-backed, for now just Nullable + expected = Index(np.arange(len(index), 0, -1), dtype=index.dtype) else: expected = Index(np.arange(len(index), 0, -1)) From a9ef37ec1c43cb95238d31dc26a5d01a3b136ee3 Mon Sep 17 00:00:00 2001 From: Brock Date: Thu, 21 Oct 2021 12:23:53 -0700 Subject: [PATCH 09/57] fix some tests --- pandas/_libs/index.pyx | 12 +++++++---- pandas/_libs/testing.pyx | 23 +++++++++++----------- pandas/conftest.py | 9 ++++----- pandas/core/algorithms.py | 6 +----- pandas/core/arrays/floating.py | 7 ++++++- pandas/core/arrays/masked.py | 1 + pandas/core/arrays/string_.py | 4 +++- pandas/core/arrays/string_arrow.py | 3 +-- pandas/core/indexes/base.py | 21 ++++++++++++++------ pandas/tests/arrays/string_/test_string.py | 10 +++++----- pandas/tests/indexes/test_any_index.py | 13 +++++++++--- 11 files changed, 66 insertions(+), 43 deletions(-) diff --git a/pandas/_libs/index.pyx b/pandas/_libs/index.pyx index 9914800f5a87a..9bd93c50ef40c 100644 --- a/pandas/_libs/index.pyx +++ b/pandas/_libs/index.pyx @@ -32,8 +32,8 @@ from pandas._libs import ( algos, hashtable as 
_hash, ) -from pandas._libs.lib cimport eq_NA_compat +from pandas._libs.lib cimport eq_NA_compat from pandas._libs.missing cimport ( C_NA as NA, checknull, @@ -934,18 +934,22 @@ cdef class ExtensionEngine: return self._maybe_get_bool_indexer(val) - cdef _get_bool_indexer(self, val): + cdef ndarray _get_bool_indexer(self, val): if checknull(val): return self.values.isna().view("uint8") - return self.values == val + try: + return self.values == val + except TypeError: + # e.g. if __eq__ returns a BooleanArray instead of ndarry[bool] + return (self.values == val).to_numpy(dtype=bool, na_value=False) cdef _maybe_get_bool_indexer(self, object val): # Returns ndarray[bool] or int cdef: ndarray[uint8_t, ndim=1, cast=True] indexer - indexer = _get_bool_indexer(self.values, val) + indexer = self._get_bool_indexer(val) return _unpack_bool_indexer(indexer, val) def sizeof(self, deep: bool = False) -> int: diff --git a/pandas/_libs/testing.pyx b/pandas/_libs/testing.pyx index 90c10be990ac2..d2566123815f1 100644 --- a/pandas/_libs/testing.pyx +++ b/pandas/_libs/testing.pyx @@ -7,13 +7,13 @@ from numpy cimport import_array import_array() +from pandas._libs.missing cimport is_matching_na from pandas._libs.util cimport ( is_array, is_complex_object, is_real_number_object, ) -from pandas._libs.missing cimport is_matching_na from pandas.core.dtypes.common import is_dtype_equal from pandas.core.dtypes.missing import ( array_equivalent, @@ -175,16 +175,17 @@ cpdef assert_almost_equal(a, b, # classes can't be the same, to raise error assert_class_equal(a, b, obj=obj) - if isna(a): - if isna(b): - # TODO: Should require same-dtype NA? - # nan / None comparison - return True - - assert False, f"expected {a} but got {b}" - - elif isna(b): - assert False, f"expected {a} but got {b}" + if isna(a) and isna(b): + return True + #if isna(b): + # # TODO: Should require same-dtype NA? 
+ # # nan / None comparison + # return True + # + #assert False, f"expected {a} but got {b}" + + #elif isna(b): + # assert False, f"expected {a} but got {b}" # TODO: test for tm.assert_whatever with pd.NA that would raise here if a == b: diff --git a/pandas/conftest.py b/pandas/conftest.py index c66f386015685..ef48dc0055615 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -511,15 +511,14 @@ def _create_mi_with_dt64tz_level(): "mi-with-dt64tz-level": _create_mi_with_dt64tz_level(), "multi": _create_multiindex(), "repeats": Index([0, 0, 1, 1, 2, 2]), + # TODO: make sure we have cases that also have NA values + # (not allowed in this fixture) "nullable_int": Index(np.arange(100), dtype="Int64"), "nullable_uint": Index(np.arange(100), dtype="UInt16"), "nullable_float": Index(np.arange(100), dtype="Float32"), "nullable_bool": Index(np.arange(100).astype(bool), dtype="boolean"), - #"nullable_int-na": Index(np.arange(100), dtype="Int64").insert(1, pd.NA), - #"nullable_float-na": Index(np.arange(100), dtype="Float32").insert(1, pd.NA), - #"nullable_bool-na": Index(np.arange(100).astype(bool), dtype="boolean").insert( - # 1, pd.NA - #), + "string-python": Index(pd.array(tm.makeStringIndex(100), dtype="string[python]")), + "string-pyarrow": Index(pd.array(tm.makeStringIndex(100), dtype="string[pyarrow]")), } diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 45ea5fc2716e4..3740667a2efff 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -1843,11 +1843,7 @@ def union_with_duplicates(lvals: ArrayLike, rvals: ArrayLike) -> ArrayLike: l_count, r_count = l_count.align(r_count, fill_value=0) unique_array = unique(concat_compat([lvals, rvals])) unique_array = ensure_wrapped_if_datetimelike(unique_array) - if False:#not isinstance(lvals, np.ndarray): - # i.e. ExtensionArray - # Note: we only get here with lvals.dtype == rvals.dtype - # TODO: are there any cases where union won't be type/dtype preserving? 
- unique_array = type(lvals)._from_sequence(unique_array, dtype=lvals.dtype) + for i, value in enumerate(unique_array): indexer += [i] * int(max(l_count[value], r_count[value])) return unique_array.take(indexer) diff --git a/pandas/core/arrays/floating.py b/pandas/core/arrays/floating.py index 9e03489d4343e..1dee05954d8d5 100644 --- a/pandas/core/arrays/floating.py +++ b/pandas/core/arrays/floating.py @@ -256,7 +256,11 @@ def dtype(self) -> FloatingDtype: return FLOAT_STR_TO_DTYPE[str(self._data.dtype)] def __init__(self, values: np.ndarray, mask: np.ndarray, copy: bool = False): - if not (isinstance(values, np.ndarray) and values.dtype.kind == "f" and values.dtype.itemsize > 2): + if not ( + isinstance(values, np.ndarray) + and values.dtype.kind == "f" + and values.dtype.itemsize > 2 + ): # We do not support float16 raise TypeError( "values should be floating numpy array. Use " @@ -428,6 +432,7 @@ def _maybe_mask_result(self, result, mask, other, op_name: str): def isna(self): return self._mask | np.isnan(self._data) + _dtype_docstring = """ An ExtensionDtype for {dtype} data. diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index 877a5f24367e4..51429a4fd0418 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -453,6 +453,7 @@ def reconstruct(x): return IntegerArray(x, m) elif is_float_dtype(x.dtype): if x.dtype.itemsize <= 2: + # reached in e.g. 
np.sqrt on BooleanArray # we don't support float16 x = x.astype(np.float32) m = mask.copy() diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index d93fa4bbdd7fc..00281bc8a7101 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -474,7 +474,9 @@ def max(self, axis=None, skipna: bool = True, **kwargs) -> Scalar: def value_counts(self, dropna: bool = True): from pandas import value_counts - return value_counts(self._ndarray, dropna=dropna).astype("Int64") + result = value_counts(self._ndarray, dropna=dropna).astype("Int64") + result.index = result.index.astype(self.dtype) + return result def memory_usage(self, deep: bool = False) -> int: result = self._ndarray.nbytes diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index c7d08f7873c09..ccd3e063c48a6 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -668,8 +668,7 @@ def value_counts(self, dropna: bool = True) -> Series: # No missing values so we can adhere to the interface and return a numpy array. 
counts = np.array(counts) - # Index cannot hold ExtensionArrays yet - index = Index(type(self)(values)).astype(object) + index = Index(type(self)(values)) return Series(counts, index=index).astype("Int64") diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index b6f0ef937f8ba..cb426a2e408c2 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -69,6 +69,7 @@ can_hold_element, find_common_type, infer_dtype_from, + maybe_cast_pointwise_result, validate_numeric_casting, ) from pandas.core.dtypes.common import ( @@ -1082,6 +1083,7 @@ def take( values, indices, allow_fill=allow_fill, fill_value=self._na_value ) else: + # algos.take passes 'axis' keyword which not all EAs accept taken = values.take( indices, allow_fill=allow_fill, fill_value=self._na_value ) @@ -2576,8 +2578,12 @@ def __reduce__(self): # -------------------------------------------------------------------- # Null Handling Methods - _na_value: float | NaTType = np.nan - """The expected NA value to use with this index.""" + @cache_readonly + def _na_value(self): + """The expected NA value to use with this index.""" + if isinstance(self.dtype, np.dtype): + return np.nan + return self.dtype.na_value @cache_readonly def _isnan(self) -> npt.NDArray[np.bool_]: @@ -3634,7 +3640,7 @@ def get_indexer( indexer = self._engine.get_indexer(target.codes) if self.hasnans and target.hasnans: - #loc = self.get_loc(libmissing.NA) + # loc = self.get_loc(libmissing.NA) loc = self.get_loc(np.nan) mask = target.isna() indexer[mask] = loc @@ -3653,7 +3659,7 @@ def get_indexer( # Exclude MultiIndex because hasnans raises NotImplementedError # we should only get here if we are unique, so loc is an integer # GH#41934 - #loc = self.get_loc(libmissing.NA) + # loc = self.get_loc(libmissing.NA) loc = self.get_loc(np.nan) mask = target.isna() indexer[mask] = loc @@ -5981,9 +5987,12 @@ def map(self, mapper, na_action=None): new_values, dtype=dtype, copy=False, name=self.name ) - result = 
Index._with_infer(new_values, dtype=dtype, copy=False, name=self.name) + res_values = maybe_cast_pointwise_result( + new_values, self.dtype, same_dtype=True + ) + result = Index._with_infer(res_values, dtype=dtype, copy=False, name=self.name) - if type(self) is Index and not isinstance(self.dtype, np.dtype): + if False: # type(self) is Index and not isinstance(self.dtype, np.dtype): # TODO: what about "integer-na" if self.dtype.kind in ["i", "u"] and result.inferred_type == "integer": # TODO: worry about itemsize/overflows? diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index fa564ac76f8bb..e540ed248a57e 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -466,18 +466,18 @@ def test_arrow_load_from_zero_chunks(dtype, string_storage2): def test_value_counts_na(dtype): arr = pd.array(["a", "b", "a", pd.NA], dtype=dtype) result = arr.value_counts(dropna=False) - expected = pd.Series([2, 1, 1], index=["a", "b", pd.NA], dtype="Int64") + expected = pd.Series([2, 1, 1], index=arr[[0, 1, 3]], dtype="Int64") tm.assert_series_equal(result, expected) result = arr.value_counts(dropna=True) - expected = pd.Series([2, 1], index=["a", "b"], dtype="Int64") + expected = pd.Series([2, 1], index=arr[:2], dtype="Int64") tm.assert_series_equal(result, expected) def test_value_counts_with_normalize(dtype): - s = pd.Series(["a", "b", "a", pd.NA], dtype=dtype) - result = s.value_counts(normalize=True) - expected = pd.Series([2, 1], index=["a", "b"], dtype="Float64") / 3 + ser = pd.Series(["a", "b", "a", pd.NA], dtype=dtype) + result = ser.value_counts(normalize=True) + expected = pd.Series([2, 1], index=ser[:2], dtype="Float64") / 3 tm.assert_series_equal(result, expected) diff --git a/pandas/tests/indexes/test_any_index.py b/pandas/tests/indexes/test_any_index.py index f68bde2188e67..3f85cd59bff76 100644 --- a/pandas/tests/indexes/test_any_index.py +++ 
b/pandas/tests/indexes/test_any_index.py @@ -139,9 +139,16 @@ def test_slice_keeps_name(self, index): # FutureWarning from non-tuple sequence of nd indexing @pytest.mark.filterwarnings("ignore::FutureWarning") def test_getitem_error(self, index, item): - msg = r"index 101 is out of bounds for axis 0 with size [\d]+|" + re.escape( - "only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) " - "and integer or boolean arrays are valid indices" + msg = "|".join( + [ + r"index 101 is out of bounds for axis 0 with size [\d]+", + re.escape( + "only integers, slices (`:`), ellipsis (`...`), " + "numpy.newaxis (`None`) and integer or boolean arrays " + "are valid indices" + ), + "index out of bounds", # string[pyarrow] + ] ) with pytest.raises(IndexError, match=msg): index[item] From 41acf3f9390d55413416b60218a4c62a509eb511 Mon Sep 17 00:00:00 2001 From: Brock Date: Thu, 21 Oct 2021 19:00:32 -0700 Subject: [PATCH 10/57] ENH: ExtensionArray.insert --- doc/source/reference/extensions.rst | 1 + pandas/core/arrays/_mixins.py | 3 ++ pandas/core/arrays/base.py | 30 ++++++++++++++++++ pandas/tests/extension/base/methods.py | 42 ++++++++++++++++++++++++++ pandas/tests/extension/conftest.py | 12 ++++++++ pandas/tests/extension/test_numpy.py | 12 ++++++++ pandas/tests/extension/test_string.py | 7 +++++ pandas/util/_validators.py | 23 +++++++++++++- 8 files changed, 129 insertions(+), 1 deletion(-) diff --git a/doc/source/reference/extensions.rst b/doc/source/reference/extensions.rst index e2e8c94ef8fc6..ce8d8d5c2ca10 100644 --- a/doc/source/reference/extensions.rst +++ b/doc/source/reference/extensions.rst @@ -48,6 +48,7 @@ objects. 
api.extensions.ExtensionArray.equals api.extensions.ExtensionArray.factorize api.extensions.ExtensionArray.fillna + api.extensions.ExtensionArray.insert api.extensions.ExtensionArray.isin api.extensions.ExtensionArray.isna api.extensions.ExtensionArray.ravel diff --git a/pandas/core/arrays/_mixins.py b/pandas/core/arrays/_mixins.py index 3769c686da029..cf9820c3aa8f8 100644 --- a/pandas/core/arrays/_mixins.py +++ b/pandas/core/arrays/_mixins.py @@ -31,6 +31,7 @@ from pandas.util._validators import ( validate_bool_kwarg, validate_fillna_kwargs, + validate_insert_loc, ) from pandas.core.dtypes.common import is_dtype_equal @@ -359,6 +360,8 @@ def insert( ------- type(self) """ + loc = validate_insert_loc(loc, len(self)) + code = self._validate_scalar(item) new_vals = np.concatenate( diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index bf54f7166e14d..9b25a1b5abccd 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -47,6 +47,7 @@ from pandas.util._validators import ( validate_bool_kwarg, validate_fillna_kwargs, + validate_insert_loc, ) from pandas.core.dtypes.cast import maybe_cast_to_extension_array @@ -123,6 +124,7 @@ class ExtensionArray: factorize fillna equals + insert isin isna ravel @@ -1388,6 +1390,34 @@ def delete(self: ExtensionArrayT, loc: PositionalIndexer) -> ExtensionArrayT: indexer = np.delete(np.arange(len(self)), loc) return self.take(indexer) + def insert(self: ExtensionArrayT, loc: int, item) -> ExtensionArrayT: + """ + Insert an item at the given position. + + Parameters + ---------- + loc : int + item : scalar-like + + Returns + ------- + same type as self + + Notes + ----- + This method should be both type and dtype-preserving. If the item + cannot be held in an array of this type/dtype, either ValueError or + TypeError should be raised. + + The default implementation relies on _from_sequence to raise on invalid + items. 
+ """ + loc = validate_insert_loc(loc, len(self)) + + item_arr = type(self)._from_sequence([item], dtype=self.dtype) + + return type(self)._concat_same_type([self[:loc], item_arr, self[loc:]]) + @classmethod def _empty(cls, shape: Shape, dtype: ExtensionDtype): """ diff --git a/pandas/tests/extension/base/methods.py b/pandas/tests/extension/base/methods.py index d390d4b5d8143..c96e2fb49e397 100644 --- a/pandas/tests/extension/base/methods.py +++ b/pandas/tests/extension/base/methods.py @@ -511,6 +511,48 @@ def test_delete(self, data): expected = data._concat_same_type([data[[0]], data[[2]], data[4:]]) self.assert_extension_array_equal(result, expected) + def test_insert(self, data): + # insert at the beginning + result = data[1:].insert(0, data[0]) + self.assert_extension_array_equal(result, data) + + result = data[1:].insert(-len(data[1:]), data[0]) + self.assert_extension_array_equal(result, data) + + # insert at the middle + result = data[:-1].insert(4, data[-1]) + + taker = np.arange(len(data)) + taker[5:] = taker[4:-1] + taker[4] = len(data) - 1 + expected = data.take(taker) + self.assert_extension_array_equal(result, expected) + + def test_insert_invalid(self, data, invalid_scalar): + item = invalid_scalar + + with pytest.raises((TypeError, ValueError)): + data.insert(0, item) + + with pytest.raises((TypeError, ValueError)): + data.insert(4, item) + + with pytest.raises((TypeError, ValueError)): + data.insert(len(data) - 1, item) + + def test_insert_invalid_loc(self, data): + ub = len(data) + + with pytest.raises(IndexError): + data.insert(ub + 1, data[0]) + + with pytest.raises(IndexError): + data.insert(-ub - 1, data[0]) + + with pytest.raises(TypeError): + # we expect TypeError here instead of IndexError to match np.insert + data.insert(1.5, data[0]) + @pytest.mark.parametrize("box", [pd.array, pd.Series, pd.DataFrame]) def test_equals(self, data, na_value, as_series, box): data2 = type(data)._from_sequence([data[0]] * len(data), dtype=data.dtype) diff 
--git a/pandas/tests/extension/conftest.py b/pandas/tests/extension/conftest.py index 1942d737780da..3827ba234cfd8 100644 --- a/pandas/tests/extension/conftest.py +++ b/pandas/tests/extension/conftest.py @@ -181,3 +181,15 @@ def as_array(request): Boolean fixture to support ExtensionDtype _from_sequence method testing. """ return request.param + + +@pytest.fixture +def invalid_scalar(data): + """ + A scalar that *cannot* be held by this ExtensionArray. + + The default should work for most subclasses, but is not guaranteed. + + If the array can hold any item (i.e. object dtype), then use pytest.skip. + """ + return object.__new__(object) diff --git a/pandas/tests/extension/test_numpy.py b/pandas/tests/extension/test_numpy.py index 7be776819e399..0e3e26e7e9500 100644 --- a/pandas/tests/extension/test_numpy.py +++ b/pandas/tests/extension/test_numpy.py @@ -265,6 +265,18 @@ def test_searchsorted(self, data_for_sorting, as_series): def test_diff(self, data, periods): return super().test_diff(data, periods) + def test_insert(self, data, request): + if data.dtype.numpy_dtype == object: + mark = pytest.mark.xfail(reason="Dimension mismatch in np.concatenate") + request.node.add_marker(mark) + + super().test_insert(data) + + @skip_nested + def test_insert_invalid(self, data, invalid_scalar): + # PandasArray[object] can hold anything, so skip + super().test_insert_invalid(data, invalid_scalar) + class TestArithmetics(BaseNumPyTests, base.BaseArithmeticOpsTests): divmod_exc = None diff --git a/pandas/tests/extension/test_string.py b/pandas/tests/extension/test_string.py index af86c359c4c00..06b07968f949e 100644 --- a/pandas/tests/extension/test_string.py +++ b/pandas/tests/extension/test_string.py @@ -160,6 +160,13 @@ def test_value_counts(self, all_data, dropna): def test_value_counts_with_normalize(self, data): pass + def test_insert_invalid(self, data, invalid_scalar, request): + if data.dtype.storage == "pyarrow": + mark = pytest.mark.xfail(reason="casts invalid_scalar to 
string") + request.node.add_marker(mark) + + super().test_insert_invalid(data, invalid_scalar) + class TestCasting(base.BaseCastingTests): pass diff --git a/pandas/util/_validators.py b/pandas/util/_validators.py index 7e03e3ceea11d..f8bd1ec7bc96a 100644 --- a/pandas/util/_validators.py +++ b/pandas/util/_validators.py @@ -12,7 +12,10 @@ import numpy as np -from pandas.core.dtypes.common import is_bool +from pandas.core.dtypes.common import ( + is_bool, + is_integer, +) def _check_arg_length(fname, args, max_fname_arg_count, compat_args): @@ -494,3 +497,21 @@ def validate_inclusive(inclusive: str | None) -> tuple[bool, bool]: ) return left_right_inclusive + + +def validate_insert_loc(loc: int, length: int) -> int: + """ + Check that we have an integer between -length and length, inclusive. + + Standardize negative loc to within [0, length]. + + The exceptions we raise on failure match np.insert. + """ + if not is_integer(loc): + raise TypeError(f"loc must be an integer between -{length} and {length}") + + if loc < 0: + loc += length + if not 0 <= loc <= length: + raise IndexError(f"loc must be an integer between -{length} and {length}") + return loc From 37d36adfad243148f7d55f5dfa9218e7221faeb3 Mon Sep 17 00:00:00 2001 From: Brock Date: Thu, 21 Oct 2021 19:43:20 -0700 Subject: [PATCH 11/57] Fix usage --- pandas/tests/arithmetic/test_interval.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/pandas/tests/arithmetic/test_interval.py b/pandas/tests/arithmetic/test_interval.py index 12220e825aed4..88b26dcc4d707 100644 --- a/pandas/tests/arithmetic/test_interval.py +++ b/pandas/tests/arithmetic/test_interval.py @@ -28,16 +28,16 @@ (Index([0, 2, 4, 4]), Index([1, 3, 5, 8])), (Index([0.0, 1.0, 2.0, np.nan]), Index([1.0, 2.0, 3.0, np.nan])), ( - timedelta_range("0 days", periods=3).insert(4, pd.NaT), - timedelta_range("1 day", periods=3).insert(4, pd.NaT), + timedelta_range("0 days", periods=3).insert(3, pd.NaT), + timedelta_range("1 day", 
periods=3).insert(3, pd.NaT), ), ( - date_range("20170101", periods=3).insert(4, pd.NaT), - date_range("20170102", periods=3).insert(4, pd.NaT), + date_range("20170101", periods=3).insert(3, pd.NaT), + date_range("20170102", periods=3).insert(3, pd.NaT), ), ( - date_range("20170101", periods=3, tz="US/Eastern").insert(4, pd.NaT), - date_range("20170102", periods=3, tz="US/Eastern").insert(4, pd.NaT), + date_range("20170101", periods=3, tz="US/Eastern").insert(3, pd.NaT), + date_range("20170102", periods=3, tz="US/Eastern").insert(3, pd.NaT), ), ], ids=lambda x: str(x[0].dtype), From bafb23f57ec15a9ad0c1818ecfd428a4c7a935de Mon Sep 17 00:00:00 2001 From: Brock Date: Thu, 21 Oct 2021 22:05:25 -0700 Subject: [PATCH 12/57] Fix TimedeltaIndex.insert test --- pandas/tests/indexes/timedeltas/methods/test_insert.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/tests/indexes/timedeltas/methods/test_insert.py b/pandas/tests/indexes/timedeltas/methods/test_insert.py index 809d21db805e0..3af4b6b47fa2f 100644 --- a/pandas/tests/indexes/timedeltas/methods/test_insert.py +++ b/pandas/tests/indexes/timedeltas/methods/test_insert.py @@ -136,8 +136,8 @@ def test_insert_empty(self): result = idx[:0].insert(0, td) assert result.freq == "D" - result = idx[:0].insert(1, td) - assert result.freq == "D" + with pytest.raises(IndexError, match="loc must be an integer between"): + result = idx[:0].insert(1, td) - result = idx[:0].insert(-1, td) - assert result.freq == "D" + with pytest.raises(IndexError, match="loc must be an integer between"): + result = idx[:0].insert(-1, td) From a3a349de719b060aa2ebd03b140ed5d2c4468c1d Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 22 Oct 2021 20:51:16 -0700 Subject: [PATCH 13/57] pass a few more tests --- pandas/_libs/index.pyx | 9 +++++- pandas/core/algorithms.py | 2 -- pandas/core/arrays/sparse/array.py | 4 +-- pandas/core/arrays/string_arrow.py | 5 +++ pandas/core/indexes/base.py | 44 ++++++++++++++------------ 
pandas/core/indexes/extension.py | 25 --------------- pandas/tests/base/test_misc.py | 3 ++ pandas/tests/extension/base/getitem.py | 9 ++++++ pandas/tests/indexes/common.py | 30 +++++++++++++++--- pandas/tests/indexes/test_any_index.py | 2 ++ pandas/tests/indexing/test_indexing.py | 4 +++ 11 files changed, 82 insertions(+), 55 deletions(-) diff --git a/pandas/_libs/index.pyx b/pandas/_libs/index.pyx index 9bd93c50ef40c..2222bfd61664f 100644 --- a/pandas/_libs/index.pyx +++ b/pandas/_libs/index.pyx @@ -942,7 +942,14 @@ cdef class ExtensionEngine: return self.values == val except TypeError: # e.g. if __eq__ returns a BooleanArray instead of ndarry[bool] - return (self.values == val).to_numpy(dtype=bool, na_value=False) + try: + return (self.values == val).to_numpy(dtype=bool, na_value=False) + except (TypeError, AttributeError) as err: + # e.g. (self.values == val) returned a bool + # see test_get_loc_generator[string[pyarrow]] + # e.g. self.value == val raises TypeError bc generator has no len + # see test_get_loc_generator[string[python]] + raise KeyError from err cdef _maybe_get_bool_indexer(self, object val): # Returns ndarray[bool] or int diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 2c028ffc8c471..c1b587ce3a6b2 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -1840,8 +1840,6 @@ def union_with_duplicates(lvals: ArrayLike, rvals: ArrayLike) -> ArrayLike: ----- Caller is responsible for ensuring lvals.dtype == rvals.dtype. 
""" - from pandas.core.dtypes.concat import concat_compat - indexer = [] l_count = value_counts(lvals, dropna=False) r_count = value_counts(rvals, dropna=False) diff --git a/pandas/core/arrays/sparse/array.py b/pandas/core/arrays/sparse/array.py index 87fcf54ed684b..4d48fcfefd65c 100644 --- a/pandas/core/arrays/sparse/array.py +++ b/pandas/core/arrays/sparse/array.py @@ -942,10 +942,10 @@ def __getitem__( # mypy doesn't know we have an array here key = cast(np.ndarray, key) return self.take(np.arange(len(key), dtype=np.int32)[key]) - elif hasattr(key, "__len__"): + elif lib.is_list_like(key): return self.take(key) else: - raise ValueError(f"Cannot slice with '{key}'") + raise IndexError(f"Cannot slice with '{key}'") return type(self)(data_slice, kind=self.kind) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index ccd3e063c48a6..6858b55f6d3ef 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -324,6 +324,11 @@ def __getitem__( item = item[1] elif item[1] is Ellipsis: item = item[0] + elif not (is_integer(item) or isinstance(item, slice) or item is Ellipsis): + raise IndexError( + "Only integers, slices and integer or " + "boolean arrays are valid indices." + ) # We are not an array indexer, so maybe e.g. a slice or integer # indexer. We dispatch to pyarrow. diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 16dd757d20b5b..c57ff638acef0 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -4831,6 +4831,13 @@ def _validate_fill_value(self, value): TypeError If the value cannot be inserted into an array of this dtype. """ + #if type(self) is Index and self.dtype != object: + # # FIXME: kludge; work this into can_hold_element? 
+ # try: + # type(self._values)._from_sequence([value], dtype=self.dtype) + # except ValueError as err: + # raise TypeError from err + # return value if not can_hold_element(self._values, value): raise TypeError return value @@ -5986,12 +5993,12 @@ def map(self, mapper, na_action=None): new_values, dtype=dtype, copy=False, name=self.name ) - res_values = maybe_cast_pointwise_result( - new_values, self.dtype, same_dtype=True - ) - result = Index._with_infer(res_values, dtype=dtype, copy=False, name=self.name) + #res_values = maybe_cast_pointwise_result( + # new_values, self.dtype, same_dtype=True + #) + result = Index._with_infer(new_values, dtype=dtype, copy=False, name=self.name) - if False: # type(self) is Index and not isinstance(self.dtype, np.dtype): + if type(self) is Index and not isinstance(self.dtype, np.dtype): # TODO: what about "integer-na" if self.dtype.kind in ["i", "u"] and result.inferred_type == "integer": # TODO: worry about itemsize/overflows? @@ -6474,26 +6481,21 @@ def insert(self, loc: int, item) -> Index: if is_valid_na_for_dtype(item, self.dtype) and self.dtype != object: item = self._na_value + arr = self._values + try: - item = self._validate_fill_value(item) - except TypeError: + if isinstance(arr, ExtensionArray): + res_values = arr.insert(loc, item) + return type(self)._simple_new(res_values, name=self.name) + else: + item = self._validate_fill_value(item) + except (TypeError, ValueError): + # e.g. trying to insert an integer into a DatetimeIndex + # We cannot keep the same dtype, so cast to the (often object) + # minimal shared dtype before doing the insert. 
dtype = self._find_common_type_compat(item) return self.astype(dtype).insert(loc, item) - arr = self._values - - if isinstance(arr, ExtensionArray): - # TODO: need EA.insert - try: - arr2 = type(arr)._from_sequence([item], dtype=arr.dtype) - except TypeError: - # TODO: make this into _validate_fill_value - dtype = self._find_common_type_compat(item) - return self.astype(dtype).insert(loc, item) - - res_values = arr._concat_same_type([arr[:loc], arr2, arr[loc:]]) - return type(self)._simple_new(res_values, name=self.name) - if arr.dtype != object or not isinstance( item, (tuple, np.datetime64, np.timedelta64) ): diff --git a/pandas/core/indexes/extension.py b/pandas/core/indexes/extension.py index ccd18f54da327..7c7f1b267b5be 100644 --- a/pandas/core/indexes/extension.py +++ b/pandas/core/indexes/extension.py @@ -134,31 +134,6 @@ class ExtensionIndex(Index): # --------------------------------------------------------------------- - def insert(self, loc: int, item) -> Index: - """ - Make new Index inserting new item at location. Follows - Python list.append semantics for negative values. - - Parameters - ---------- - loc : int - item : object - - Returns - ------- - new_index : Index - """ - try: - result = self._data.insert(loc, item) - except (ValueError, TypeError): - # e.g. trying to insert an integer into a DatetimeIndex - # We cannot keep the same dtype, so cast to the (often object) - # minimal shared dtype before doing the insert. - dtype = self._find_common_type_compat(item) - return self.astype(dtype).insert(loc, item) - else: - return type(self)._simple_new(result, name=self.name) - def _validate_fill_value(self, value): """ Convert value to be insertable to underlying array. 
diff --git a/pandas/tests/base/test_misc.py b/pandas/tests/base/test_misc.py index c0250e2b3e958..2a51e4e559e14 100644 --- a/pandas/tests/base/test_misc.py +++ b/pandas/tests/base/test_misc.py @@ -11,6 +11,7 @@ from pandas.core.dtypes.common import ( is_categorical_dtype, is_object_dtype, + is_dtype_equal, ) import pandas as pd @@ -150,6 +151,8 @@ def test_access_by_position(index): assert index[-1] == index[size - 1] msg = f"index {size} is out of bounds for axis 0 with size {size}" + if is_dtype_equal(index.dtype, "string[pyarrow]"): + msg = "index out of bounds" with pytest.raises(IndexError, match=msg): index[size] msg = "single positional indexer is out-of-bounds" diff --git a/pandas/tests/extension/base/getitem.py b/pandas/tests/extension/base/getitem.py index ac181af7875b5..72e0fabd3f200 100644 --- a/pandas/tests/extension/base/getitem.py +++ b/pandas/tests/extension/base/getitem.py @@ -120,6 +120,15 @@ def test_getitem_scalar(self, data): result = pd.Series(data)[0] assert isinstance(result, data.dtype.type) + def test_getitem_invalid(self, data): + # TODO: specific exception message? 
+ + with pytest.raises(IndexError): + data["foo"] + + with pytest.raises(IndexError): + data[2.5] + def test_getitem_scalar_na(self, data_missing, na_cmp, na_value): result = data_missing[0] assert na_cmp(result, na_value) diff --git a/pandas/tests/indexes/common.py b/pandas/tests/indexes/common.py index e9d09c6de64e9..9b5fb5dee6cbb 100644 --- a/pandas/tests/indexes/common.py +++ b/pandas/tests/indexes/common.py @@ -296,10 +296,15 @@ def test_ensure_copied_data(self, index): pass elif type(index) is Index and not isinstance(index.dtype, np.dtype): result = index_type(index.values, copy=False, **init_kwargs) - # FIXME: this is specific to MaskedArray - tm.assert_numpy_array_equal(index._values._data, result._values._data, check_same="same") - tm.assert_numpy_array_equal(index._values._mask, result._values._mask, check_same="same") - + tm.assert_index_equal(result, index) + + if hasattr(index._values, "_mask"): + # FIXME: this is specific to MaskedArray + tm.assert_numpy_array_equal(index._values._data, result._values._data, check_same="same") + tm.assert_numpy_array_equal(index._values._mask, result._values._mask, check_same="same") + else: + # e.g. string[pyarrow] + raise NotImplementedError else: result = index_type(index.values, copy=False, **init_kwargs) tm.assert_numpy_array_equal(index.values, result.values, check_same="same") @@ -410,6 +415,23 @@ def test_insert_base(self, index): # test 0th element assert index[0:4].equals(result.insert(0, index[0])) + def test_insert_out_of_bounds(self, index): + # TypeError/IndexError matches what np.insert raises in these cases + + # TODO: specific exception messags? 
+ if len(index) > 0: + err = TypeError + else: + err = IndexError + with pytest.raises(err): + index.insert(0.5, "foo") + + with pytest.raises(IndexError): + index.insert(len(index) + 1, 1) + + with pytest.raises(IndexError): + index.insert(-len(index) - 1, 1) + def test_delete_base(self, index): if not len(index): return diff --git a/pandas/tests/indexes/test_any_index.py b/pandas/tests/indexes/test_any_index.py index 3f85cd59bff76..d855b547a1fca 100644 --- a/pandas/tests/indexes/test_any_index.py +++ b/pandas/tests/indexes/test_any_index.py @@ -148,6 +148,8 @@ def test_getitem_error(self, index, item): "are valid indices" ), "index out of bounds", # string[pyarrow] + "Only integers, slices and integer or " + "boolean arrays are valid indices.", # string[pyarrow] ] ) with pytest.raises(IndexError, match=msg): diff --git a/pandas/tests/indexing/test_indexing.py b/pandas/tests/indexing/test_indexing.py index d5c1d6f1533d3..f12826bac7137 100644 --- a/pandas/tests/indexing/test_indexing.py +++ b/pandas/tests/indexing/test_indexing.py @@ -102,6 +102,10 @@ def test_getitem_ndarray_3d( if type(index) is Index and not isinstance(index._values, np.ndarray): # e.g. 
Int64 msgs.append("values must be a 1D array") + + # string[pyarrow] + msgs.append("only handle 1-dimensional arrays") + msg = "|".join(msgs) potential_errors = (IndexError, ValueError, NotImplementedError) From 4229dbfd0338e787ff5941ddd7bbbd0129af8fd0 Mon Sep 17 00:00:00 2001 From: Brock Date: Sat, 23 Oct 2021 13:22:44 -0700 Subject: [PATCH 14/57] tests --- pandas/core/indexes/base.py | 15 --------------- pandas/tests/extension/base/getitem.py | 6 ++++++ pandas/tests/extension/base/setitem.py | 7 +++++++ 3 files changed, 13 insertions(+), 15 deletions(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index c57ff638acef0..7c586bd6b25a8 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -26,7 +26,6 @@ algos as libalgos, index as libindex, lib, - missing as libmissing, ) import pandas._libs.join as libjoin from pandas._libs.lib import ( @@ -35,7 +34,6 @@ ) from pandas._libs.tslibs import ( IncompatibleFrequency, - NaTType, OutOfBoundsDatetime, Timestamp, tz_compare, @@ -69,7 +67,6 @@ can_hold_element, find_common_type, infer_dtype_from, - maybe_cast_pointwise_result, validate_numeric_casting, ) from pandas.core.dtypes.common import ( @@ -3640,7 +3637,6 @@ def get_indexer( indexer = self._engine.get_indexer(target.codes) if self.hasnans and target.hasnans: - # loc = self.get_loc(libmissing.NA) loc = self.get_loc(np.nan) mask = target.isna() indexer[mask] = loc @@ -3659,7 +3655,6 @@ def get_indexer( # Exclude MultiIndex because hasnans raises NotImplementedError # we should only get here if we are unique, so loc is an integer # GH#41934 - # loc = self.get_loc(libmissing.NA) loc = self.get_loc(np.nan) mask = target.isna() indexer[mask] = loc @@ -4831,13 +4826,6 @@ def _validate_fill_value(self, value): TypeError If the value cannot be inserted into an array of this dtype. """ - #if type(self) is Index and self.dtype != object: - # # FIXME: kludge; work this into can_hold_element? 
- # try: - # type(self._values)._from_sequence([value], dtype=self.dtype) - # except ValueError as err: - # raise TypeError from err - # return value if not can_hold_element(self._values, value): raise TypeError return value @@ -5993,9 +5981,6 @@ def map(self, mapper, na_action=None): new_values, dtype=dtype, copy=False, name=self.name ) - #res_values = maybe_cast_pointwise_result( - # new_values, self.dtype, same_dtype=True - #) result = Index._with_infer(new_values, dtype=dtype, copy=False, name=self.name) if type(self) is Index and not isinstance(self.dtype, np.dtype): diff --git a/pandas/tests/extension/base/getitem.py b/pandas/tests/extension/base/getitem.py index 72e0fabd3f200..c8b9c03940672 100644 --- a/pandas/tests/extension/base/getitem.py +++ b/pandas/tests/extension/base/getitem.py @@ -129,6 +129,12 @@ def test_getitem_invalid(self, data): with pytest.raises(IndexError): data[2.5] + ub = len(data) + with pytest.raises(IndexError): + data[ub + 1] + with pytest.raises(IndexError): + data[-ub - 1] + def test_getitem_scalar_na(self, data_missing, na_cmp, na_value): result = data_missing[0] assert na_cmp(result, na_value) diff --git a/pandas/tests/extension/base/setitem.py b/pandas/tests/extension/base/setitem.py index 0392ea794237c..69b2b02bd4591 100644 --- a/pandas/tests/extension/base/setitem.py +++ b/pandas/tests/extension/base/setitem.py @@ -367,3 +367,10 @@ def test_delitem_series(self, data): expected = ser[taker] del ser[1] self.assert_series_equal(ser, expected) + + def test_setitem_invalid(self, data, invalid_scalar): + with pytest.raises((ValueError, TypeError)): + data[0] = invalid_scalar + + with pytest.raises((ValueError, TypeError)): + data[:] = invalid_scalar From 2e1843a0cb811a577da156bd11a7990a8ee0c8a7 Mon Sep 17 00:00:00 2001 From: Brock Date: Sun, 24 Oct 2021 14:38:31 -0700 Subject: [PATCH 15/57] REF: share ExtensionIndex.insert-> Index.insert --- pandas/core/dtypes/missing.py | 4 ++++ pandas/core/indexes/base.py | 15 +++++++++++---- 
pandas/core/indexes/extension.py | 25 ------------------------- pandas/tests/dtypes/test_missing.py | 9 +++++++++ 4 files changed, 24 insertions(+), 29 deletions(-) diff --git a/pandas/core/dtypes/missing.py b/pandas/core/dtypes/missing.py index f5fbd4cc4a7fc..38553bc1be8d6 100644 --- a/pandas/core/dtypes/missing.py +++ b/pandas/core/dtypes/missing.py @@ -37,6 +37,7 @@ needs_i8_conversion, ) from pandas.core.dtypes.dtypes import ( + CategoricalDtype, ExtensionDtype, IntervalDtype, PeriodDtype, @@ -641,5 +642,8 @@ def is_valid_na_for_dtype(obj, dtype: DtypeObj) -> bool: elif isinstance(dtype, IntervalDtype): return lib.is_float(obj) or obj is None or obj is libmissing.NA + elif isinstance(dtype, CategoricalDtype): + return is_valid_na_for_dtype(obj, dtype.categories.dtype) + # fallback, default to allowing NaN, None, NA, NaT return not isinstance(obj, (np.datetime64, np.timedelta64, Decimal)) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 05047540c6ccd..e82bd61938f15 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -6432,14 +6432,21 @@ def insert(self, loc: int, item) -> Index: if is_valid_na_for_dtype(item, self.dtype) and self.dtype != object: item = self._na_value + arr = self._values + try: - item = self._validate_fill_value(item) - except TypeError: + if isinstance(arr, ExtensionArray): + res_values = arr.insert(loc, item) + return type(self)._simple_new(res_values, name=self.name) + else: + item = self._validate_fill_value(item) + except (TypeError, ValueError): + # e.g. trying to insert an integer into a DatetimeIndex + # We cannot keep the same dtype, so cast to the (often object) + # minimal shared dtype before doing the insert. 
dtype = self._find_common_type_compat(item) return self.astype(dtype).insert(loc, item) - arr = self._values - if arr.dtype != object or not isinstance( item, (tuple, np.datetime64, np.timedelta64) ): diff --git a/pandas/core/indexes/extension.py b/pandas/core/indexes/extension.py index ccd18f54da327..7c7f1b267b5be 100644 --- a/pandas/core/indexes/extension.py +++ b/pandas/core/indexes/extension.py @@ -134,31 +134,6 @@ class ExtensionIndex(Index): # --------------------------------------------------------------------- - def insert(self, loc: int, item) -> Index: - """ - Make new Index inserting new item at location. Follows - Python list.append semantics for negative values. - - Parameters - ---------- - loc : int - item : object - - Returns - ------- - new_index : Index - """ - try: - result = self._data.insert(loc, item) - except (ValueError, TypeError): - # e.g. trying to insert an integer into a DatetimeIndex - # We cannot keep the same dtype, so cast to the (often object) - # minimal shared dtype before doing the insert. - dtype = self._find_common_type_compat(item) - return self.astype(dtype).insert(loc, item) - else: - return type(self)._simple_new(result, name=self.name) - def _validate_fill_value(self, value): """ Convert value to be insertable to underlying array. 
diff --git a/pandas/tests/dtypes/test_missing.py b/pandas/tests/dtypes/test_missing.py index bf68c4b79bcea..55d0e5e73418e 100644 --- a/pandas/tests/dtypes/test_missing.py +++ b/pandas/tests/dtypes/test_missing.py @@ -18,6 +18,7 @@ is_scalar, ) from pandas.core.dtypes.dtypes import ( + CategoricalDtype, DatetimeTZDtype, IntervalDtype, PeriodDtype, @@ -739,3 +740,11 @@ def test_is_valid_na_for_dtype_interval(self): dtype = IntervalDtype("datetime64[ns]", "both") assert not is_valid_na_for_dtype(NaT, dtype) + + def test_is_valid_na_for_dtype_categorical(self): + dtype = CategoricalDtype(categories=[0, 1, 2]) + assert is_valid_na_for_dtype(np.nan, dtype) + + assert not is_valid_na_for_dtype(NaT, dtype) + assert not is_valid_na_for_dtype(np.datetime64("NaT", "ns"), dtype) + assert not is_valid_na_for_dtype(np.timedelta64("NaT", "ns"), dtype) From 2bb1dea3c78a7ee95a82e0b0267a49619d18a0f3 Mon Sep 17 00:00:00 2001 From: Brock Date: Tue, 26 Oct 2021 12:24:47 -0700 Subject: [PATCH 16/57] handle a few more tests --- pandas/core/indexes/base.py | 2 ++ pandas/tests/extension/test_numpy.py | 5 +++++ pandas/tests/indexes/common.py | 30 +++++++++++++++++++++------- 3 files changed, 30 insertions(+), 7 deletions(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 7c586bd6b25a8..efa47ea7220e5 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -5993,6 +5993,8 @@ def map(self, mapper, na_action=None): result = result.astype(self.dtype, copy=False) elif self.dtype == "boolean" and result.inferred_type == "boolean": result = result.astype(self.dtype, copy=False) + elif self.dtype == "string" and result.inferred_type == "string": + result = result.astype(self.dtype, copy=False) return result diff --git a/pandas/tests/extension/test_numpy.py b/pandas/tests/extension/test_numpy.py index 0e3e26e7e9500..8a643b999f464 100644 --- a/pandas/tests/extension/test_numpy.py +++ b/pandas/tests/extension/test_numpy.py @@ -446,6 +446,11 @@ def 
test_setitem_with_expansion_dataframe_column(self, data, full_indexer): expected = pd.DataFrame({"data": data.to_numpy()}) self.assert_frame_equal(result, expected) + @skip_nested + def test_setitem_invalid(self, data, invalid_scalar): + # _nothing_ is invalid for object dtype + super().test_setitem_invalid(data, invalid_scalar) + @skip_nested class TestParsing(BaseNumPyTests, base.BaseParsingTests): diff --git a/pandas/tests/indexes/common.py b/pandas/tests/indexes/common.py index 9b5fb5dee6cbb..1ed001e619dd5 100644 --- a/pandas/tests/indexes/common.py +++ b/pandas/tests/indexes/common.py @@ -36,6 +36,7 @@ Int64Index, UInt64Index, ) +from pandas.core.arrays import BaseMaskedArray class Base: @@ -298,13 +299,26 @@ def test_ensure_copied_data(self, index): result = index_type(index.values, copy=False, **init_kwargs) tm.assert_index_equal(result, index) - if hasattr(index._values, "_mask"): - # FIXME: this is specific to MaskedArray - tm.assert_numpy_array_equal(index._values._data, result._values._data, check_same="same") - tm.assert_numpy_array_equal(index._values._mask, result._values._mask, check_same="same") + if isinstance(index._values, BaseMaskedArray): + assert np.shares_memory(index._values._data, result._values._data) + tm.assert_numpy_array_equal( + index._values._data, result._values._data, check_same="same" + ) + assert np.shares_memory(index._values._mask, result._values._mask) + tm.assert_numpy_array_equal( + index._values._mask, result._values._mask, check_same="same" + ) + elif index.dtype == "string[python]": + assert np.shares_memory(index._values._ndarray, result._values._ndarray) + tm.assert_numpy_array_equal( + index._values._ndarray, result._values._ndarray, check_same="same" + ) + elif index.dtype == "string[pyarrow]": + raise NotImplementedError( + "How do we check that we don't have a copy? xref #44152" + ) else: - # e.g. 
string[pyarrow] - raise NotImplementedError + raise NotImplementedError(index.dtype) else: result = index_type(index.values, copy=False, **init_kwargs) tm.assert_numpy_array_equal(index.values, result.values, check_same="same") @@ -325,7 +339,9 @@ def test_memory_usage(self, index): # RangeIndex, IntervalIndex # don't have engines # Index[EA] has engine but it does not have a Hashtable .mapping - if not isinstance(index, (RangeIndex, IntervalIndex)) and not (type(index) is Index and not isinstance(index.dtype, np.dtype)): + if not isinstance(index, (RangeIndex, IntervalIndex)) and not ( + type(index) is Index and not isinstance(index.dtype, np.dtype) + ): assert result2 > result if index.inferred_type == "object": From 36b662908a960f824cf8a7392a61bdceb5850546 Mon Sep 17 00:00:00 2001 From: Brock Date: Wed, 27 Oct 2021 12:56:36 -0700 Subject: [PATCH 17/57] update test --- pandas/tests/indexes/common.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/pandas/tests/indexes/common.py b/pandas/tests/indexes/common.py index 1ed001e619dd5..db5f2fb78bbde 100644 --- a/pandas/tests/indexes/common.py +++ b/pandas/tests/indexes/common.py @@ -314,9 +314,12 @@ def test_ensure_copied_data(self, index): index._values._ndarray, result._values._ndarray, check_same="same" ) elif index.dtype == "string[pyarrow]": - raise NotImplementedError( - "How do we check that we don't have a copy? xref #44152" - ) + # https://github.com/pandas-dev/pandas/pull/43930#discussion_r736862669 + result_pa_data = result._values._data + index_pa_data = index._values._data + res_buf1 = result_pa_data.chunk(0).buffers()[1] + idx_buf1 = index_pa_data.chunk(0).buffers()[1] + assert res_buf1.address == idx_buf1.address else: raise NotImplementedError(index.dtype) else: @@ -434,7 +437,7 @@ def test_insert_base(self, index): def test_insert_out_of_bounds(self, index): # TypeError/IndexError matches what np.insert raises in these cases - # TODO: specific exception messags? 
+ # TODO: specific exception messages? if len(index) > 0: err = TypeError else: From 1881599fd0041d6a4f0048123d034802fc8a41bd Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 29 Oct 2021 12:21:25 -0700 Subject: [PATCH 18/57] Fix remaining tests --- pandas/core/algorithms.py | 2 +- pandas/core/indexes/base.py | 24 +++++++++----------- pandas/tests/base/test_misc.py | 9 ++++++-- pandas/tests/extension/json/array.py | 10 ++++++++- pandas/tests/extension/json/test_json.py | 14 ++++++++++++ pandas/tests/indexes/common.py | 28 +++++------------------- pandas/tests/indexes/test_any_index.py | 14 +----------- pandas/tests/indexes/test_base.py | 5 +++++ pandas/tests/indexes/test_setops.py | 15 ++++++++++++- pandas/tests/test_algos.py | 3 +-- 10 files changed, 68 insertions(+), 56 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index c1b587ce3a6b2..a1005d5f2cf9f 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -1848,5 +1848,5 @@ def union_with_duplicates(lvals: ArrayLike, rvals: ArrayLike) -> ArrayLike: unique_array = ensure_wrapped_if_datetimelike(unique_array) for i, value in enumerate(unique_array): - indexer += [i] * int(max(l_count[value], r_count[value])) + indexer += [i] * int(max(l_count.at[value], r_count.at[value])) return unique_array.take(indexer) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index efa47ea7220e5..9b741ecd8ac1a 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -67,6 +67,7 @@ can_hold_element, find_common_type, infer_dtype_from, + maybe_cast_pointwise_result, validate_numeric_casting, ) from pandas.core.dtypes.common import ( @@ -5974,6 +5975,15 @@ def map(self, mapper, na_action=None): # empty dtype = self.dtype + # e.g. if we are floating and new_values is all ints, then we + # don't want to cast back to floating. But if we are UInt64 + # and new_values is all ints, we want to try. 
+ same_dtype = lib.infer_dtype(new_values, skipna=False) == self.inferred_type + if same_dtype: + new_values = maybe_cast_pointwise_result( + new_values, self.dtype, same_dtype=same_dtype + ) + if self._is_backward_compat_public_numeric_index and is_numeric_dtype( new_values.dtype ): @@ -5982,20 +5992,6 @@ def map(self, mapper, na_action=None): ) result = Index._with_infer(new_values, dtype=dtype, copy=False, name=self.name) - - if type(self) is Index and not isinstance(self.dtype, np.dtype): - # TODO: what about "integer-na" - if self.dtype.kind in ["i", "u"] and result.inferred_type == "integer": - # TODO: worry about itemsize/overflows? - result = result.astype(self.dtype, copy=False) - elif self.dtype.kind == "f" and result.inferred_type == "floating": - # TODO: worry about itemsize/overflows? - result = result.astype(self.dtype, copy=False) - elif self.dtype == "boolean" and result.inferred_type == "boolean": - result = result.astype(self.dtype, copy=False) - elif self.dtype == "string" and result.inferred_type == "string": - result = result.astype(self.dtype, copy=False) - return result # TODO: De-duplicate with map, xref GH#32349 diff --git a/pandas/tests/base/test_misc.py b/pandas/tests/base/test_misc.py index 2a51e4e559e14..028fa45658d15 100644 --- a/pandas/tests/base/test_misc.py +++ b/pandas/tests/base/test_misc.py @@ -10,8 +10,8 @@ from pandas.core.dtypes.common import ( is_categorical_dtype, - is_object_dtype, is_dtype_equal, + is_object_dtype, ) import pandas as pd @@ -77,12 +77,16 @@ def test_memory_usage(index_or_series_obj): res = obj.memory_usage() res_deep = obj.memory_usage(deep=True) + is_ser = isinstance(obj, Series) is_object = is_object_dtype(obj) or ( isinstance(obj, Series) and is_object_dtype(obj.index) ) is_categorical = is_categorical_dtype(obj.dtype) or ( isinstance(obj, Series) and is_categorical_dtype(obj.index.dtype) ) + is_string = is_dtype_equal(obj, "string[python]") or ( + is_ser and is_dtype_equal(obj.index.dtype, 
"string[python]") + ) if len(obj) == 0: if isinstance(obj, Index): @@ -90,9 +94,10 @@ def test_memory_usage(index_or_series_obj): else: expected = 108 if IS64 else 64 assert res_deep == res == expected - elif is_object or is_categorical: + elif is_object or is_categorical or is_string: # only deep will pick them up assert res_deep > res + assert res_deep > res else: assert res == res_deep diff --git a/pandas/tests/extension/json/array.py b/pandas/tests/extension/json/array.py index 2eef828288e59..7bcd666f08b12 100644 --- a/pandas/tests/extension/json/array.py +++ b/pandas/tests/extension/json/array.py @@ -39,7 +39,10 @@ ExtensionArray, ExtensionDtype, ) -from pandas.api.types import is_bool_dtype +from pandas.api.types import ( + is_bool_dtype, + is_list_like, +) class JSONDtype(ExtensionDtype): @@ -103,6 +106,11 @@ def __getitem__(self, item): elif isinstance(item, slice): # slice return type(self)(self.data[item]) + elif not is_list_like(item): + raise IndexError( + "Only integers, slices and integer or " + "boolean arrays are valid indices." + ) else: item = pd.api.indexers.check_array_indexer(self, item) if is_bool_dtype(item.dtype): diff --git a/pandas/tests/extension/json/test_json.py b/pandas/tests/extension/json/test_json.py index f090396a70724..d5ecd8909319e 100644 --- a/pandas/tests/extension/json/test_json.py +++ b/pandas/tests/extension/json/test_json.py @@ -306,6 +306,20 @@ def test_groupby_extension_apply(self): we'll be able to dispatch unique. """ + @unhashable + def test_groupby_extension_agg(self): + """ + This fails when we get to tm.assert_series_equal when left.index + contains dictionaries, which are not hashable. + """ + + @unhashable + def test_groupby_extension_no_sort(self): + """ + This fails when we get to tm.assert_series_equal when left.index + contains dictionaries, which are not hashable. 
+ """ + @pytest.mark.xfail(reason="GH#39098: Converts agg result to object") def test_groupby_agg_extension(self, data_for_grouping): super().test_groupby_agg_extension(data_for_grouping) diff --git a/pandas/tests/indexes/common.py b/pandas/tests/indexes/common.py index db5f2fb78bbde..53b109139cc65 100644 --- a/pandas/tests/indexes/common.py +++ b/pandas/tests/indexes/common.py @@ -10,9 +10,7 @@ from pandas.core.dtypes.common import ( is_datetime64tz_dtype, - is_float_dtype, is_integer_dtype, - is_unsigned_integer_dtype, ) from pandas.core.dtypes.dtypes import CategoricalDtype @@ -641,20 +639,9 @@ def test_map(self, simple_index): # callable idx = simple_index - # we don't infer UInt64 - if is_integer_dtype(idx.dtype): - expected = idx.astype("int64") - elif is_float_dtype(idx.dtype): - expected = idx.astype("float64") - if idx._is_backward_compat_public_numeric_index: - # We get a NumericIndex back, not Float64Index - expected = type(idx)(expected) - else: - expected = idx - result = idx.map(lambda x: x) # For RangeIndex we convert to Int64Index - tm.assert_index_equal(result, expected, exact="equiv") + tm.assert_index_equal(result, idx, exact="equiv") @pytest.mark.parametrize( "mapper", @@ -671,23 +658,20 @@ def test_map_dictlike(self, mapper, simple_index): identity = mapper(idx.values, idx) - # we don't infer to UInt64 for a dict - if is_unsigned_integer_dtype(idx.dtype) and isinstance(identity, dict): - expected = idx.astype("int64") - else: - expected = idx - result = idx.map(identity) # For RangeIndex we convert to Int64Index - tm.assert_index_equal(result, expected, exact="equiv") + tm.assert_index_equal(result, idx, exact="equiv") # empty mappable + dtype = None if idx._is_backward_compat_public_numeric_index: new_index_cls = NumericIndex + if idx.dtype.kind == "f": + dtype = idx.dtype else: new_index_cls = Float64Index - expected = new_index_cls([np.nan] * len(idx)) + expected = new_index_cls([np.nan] * len(idx), dtype=dtype) result = 
idx.map(mapper(expected, idx)) tm.assert_index_equal(result, expected) diff --git a/pandas/tests/indexes/test_any_index.py b/pandas/tests/indexes/test_any_index.py index 3762c0efe1cf8..29d9eb9909338 100644 --- a/pandas/tests/indexes/test_any_index.py +++ b/pandas/tests/indexes/test_any_index.py @@ -3,11 +3,8 @@ """ import re -import numpy as np import pytest -from pandas.core.dtypes.common import is_float_dtype - import pandas._testing as tm @@ -49,16 +46,7 @@ def test_mutability(index): def test_map_identity_mapping(index): # GH#12766 result = index.map(lambda x: x) - if index._is_backward_compat_public_numeric_index: - if is_float_dtype(index.dtype): - expected = index.astype(np.float64) - elif index.dtype == np.uint64: - expected = index.astype(np.uint64) - else: - expected = index.astype(np.int64) - else: - expected = index - tm.assert_index_equal(result, expected, exact="equiv") + tm.assert_index_equal(result, index, exact="equiv") def test_wrong_number_names(index): diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index 9abf1abbc4365..9bcbeb411e51f 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -722,10 +722,15 @@ def test_map_dictlike(self, index, mapper): exp_dtype = np.float64 else: exp_dtype = np.int64 + exp_dtype = index.dtype expected = index._constructor(np.arange(len(index), 0, -1), dtype=exp_dtype) elif type(index) is Index and index.dtype != object: # i.e. EA-backed, for now just Nullable expected = Index(np.arange(len(index), 0, -1), dtype=index.dtype) + elif index.dtype.kind == "u": + # TODO: case where e.g. we cannot hold result in UInt8? 
+ expected = Index(np.arange(len(index), 0, -1), dtype=index.dtype) + else: expected = Index(np.arange(len(index), 0, -1)) diff --git a/pandas/tests/indexes/test_setops.py b/pandas/tests/indexes/test_setops.py index a0e97223435e6..bec0a41b9bb6b 100644 --- a/pandas/tests/indexes/test_setops.py +++ b/pandas/tests/indexes/test_setops.py @@ -9,6 +9,7 @@ import pytest from pandas.core.dtypes.cast import find_common_type +from pandas.core.dtypes.common import is_dtype_equal from pandas import ( CategoricalIndex, @@ -46,12 +47,24 @@ def test_union_same_types(index): assert idx1.union(idx2).dtype == idx1.dtype -def test_union_different_types(index_flat, index_flat2): +def test_union_different_types(index_flat, index_flat2, request): # This test only considers combinations of indices # GH 23525 idx1 = index_flat idx2 = index_flat2 + if ( + not idx1.is_unique + and idx1.dtype.kind == "i" + and is_dtype_equal(idx2.dtype, "boolean") + ) or ( + not idx2.is_unique + and idx2.dtype.kind == "i" + and is_dtype_equal(idx1.dtype, "boolean") + ): + mark = pytest.mark.xfail(reason="GH#44000 True==1", raises=ValueError) + request.node.add_marker(mark) + common_dtype = find_common_type([idx1.dtype, idx2.dtype]) any_uint64 = idx1.dtype == np.uint64 or idx2.dtype == np.uint64 diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index 779d6e6b6bb0f..2c7b5a7cccd95 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -2446,6 +2446,5 @@ def test_union_with_duplicates(op): result = algos.union_with_duplicates(lvals, rvals) tm.assert_numpy_array_equal(result, expected) else: - with tm.assert_produces_warning(RuntimeWarning): - result = algos.union_with_duplicates(lvals, rvals) + result = algos.union_with_duplicates(lvals, rvals) tm.assert_extension_array_equal(result, expected) From 1f973251c1014e540d42316d3569f59d50ceb12a Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 29 Oct 2021 13:48:22 -0700 Subject: [PATCH 19/57] no-pyarrow-compat --- 
pandas/conftest.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/pandas/conftest.py b/pandas/conftest.py index ef48dc0055615..b87a5e286c640 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -66,6 +66,14 @@ MultiIndex, ) +try: + import pyarrow as pa +except ImportError: + has_pyarrow = False +else: + del pa + has_pyarrow = True + # Until https://github.com/numpy/numpy/issues/19078 is sorted out, just suppress suppress_npdev_promotion_warning = pytest.mark.filterwarnings( "ignore:Promotion of numbers and bools:FutureWarning" @@ -518,8 +526,10 @@ def _create_mi_with_dt64tz_level(): "nullable_float": Index(np.arange(100), dtype="Float32"), "nullable_bool": Index(np.arange(100).astype(bool), dtype="boolean"), "string-python": Index(pd.array(tm.makeStringIndex(100), dtype="string[python]")), - "string-pyarrow": Index(pd.array(tm.makeStringIndex(100), dtype="string[pyarrow]")), } +if has_pyarrow: + idx = Index(pd.array(tm.makeStringIndex(100), dtype="string[pyarrow]")) + indices_dict["string-pyarrow"] = idx @pytest.fixture(params=indices_dict.keys()) From 076cada69b3155de6e15120c2f3f737511e6b794 Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 29 Oct 2021 13:51:42 -0700 Subject: [PATCH 20/57] mypy fixups --- pandas/_libs/index.pyi | 6 ++++++ pandas/_libs/testing.pyx | 13 ++----------- pandas/core/arrays/masked.py | 11 ++--------- pandas/core/indexes/base.py | 11 ++++++----- 4 files changed, 16 insertions(+), 25 deletions(-) diff --git a/pandas/_libs/index.pyi b/pandas/_libs/index.pyi index 446a980487cde..fd521fc446690 100644 --- a/pandas/_libs/index.pyi +++ b/pandas/_libs/index.pyi @@ -63,3 +63,9 @@ class BaseMultiIndexCodesEngine: method: str, limit: int | None, ) -> npt.NDArray[np.intp]: ... 
+ +class NullableEngine: + pass + +class ExtensionEngine: + pass diff --git a/pandas/_libs/testing.pyx b/pandas/_libs/testing.pyx index d2566123815f1..cfe9f40f12452 100644 --- a/pandas/_libs/testing.pyx +++ b/pandas/_libs/testing.pyx @@ -7,7 +7,6 @@ from numpy cimport import_array import_array() -from pandas._libs.missing cimport is_matching_na from pandas._libs.util cimport ( is_array, is_complex_object, @@ -176,18 +175,10 @@ cpdef assert_almost_equal(a, b, assert_class_equal(a, b, obj=obj) if isna(a) and isna(b): + # TODO: Should require same-dtype NA? + # nan / None comparison return True - #if isna(b): - # # TODO: Should require same-dtype NA? - # # nan / None comparison - # return True - # - #assert False, f"expected {a} but got {b}" - #elif isna(b): - # assert False, f"expected {a} but got {b}" - - # TODO: test for tm.assert_whatever with pd.NA that would raise here if a == b: # object comparison return True diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index 51429a4fd0418..b49f995ec35f7 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -614,10 +614,7 @@ def value_counts(self, dropna: bool = True) -> Series: data = self._data[~self._mask] value_counts = Index(data).value_counts() - # TODO(extension) - # if we have allow Index to hold an ExtensionArray - # this is easier - index = value_counts.index # ._values.astype(object) + index = value_counts.index # if we want nans, count the mask if dropna: @@ -628,12 +625,8 @@ def value_counts(self, dropna: bool = True) -> Series: counts[-1] = self._mask.sum() index = index.insert(len(index), self.dtype.na_value) - # index = Index( - # np.concatenate([index, np.array([self.dtype.na_value], dtype=object)]), - # dtype=object, - # ) - index = index.astype(self.dtype) + assert index.dtype == self.dtype mask = np.zeros(len(counts), dtype="bool") counts = IntegerArray(counts, mask) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 
9b741ecd8ac1a..ca82622b98cbe 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -360,7 +360,10 @@ def _outer_indexer( _typ: str = "index" _data: ExtensionArray | np.ndarray - _data_cls: type[np.ndarray] | type[ExtensionArray] = (np.ndarray, ExtensionArray) + _data_cls: tuple[type[np.ndarray], type[ExtensionArray]] = ( + np.ndarray, + ExtensionArray, + ) _id: object | None = None _name: Hashable = None # MultiIndex.levels previously allowed setting the index name. We @@ -4452,8 +4455,7 @@ def _join_non_unique( if isinstance(join_array, np.ndarray): np.putmask(join_array, mask, right) else: - # error: "ExtensionArray" has no attribute "putmask" - join_array.putmask(mask, right) # type: ignore[attr-defined] + join_array.putmask(mask, right) join_index = self._wrap_joined_index(join_array, other) @@ -5057,8 +5059,7 @@ def putmask(self, mask, value) -> Index: else: # Note: we use the original value here, not converted, as # _validate_fill_value is not idempotent - # error: "ExtensionArray" has no attribute "putmask" - values.putmask(mask, value) # type: ignore[attr-defined] + values.putmask(mask, value) return self._shallow_copy(values) From 1e8a31f321f453fcad5e9bde511cb70753ea46c1 Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 29 Oct 2021 22:00:14 -0700 Subject: [PATCH 21/57] remove assertion --- pandas/core/indexes/base.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index ca82622b98cbe..65c401a94d8b2 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -844,7 +844,6 @@ def _engine(self) -> libindex.IndexEngine: ): return libindex.ExtensionEngine(self._values) - assert self.dtype != "boolean" # to avoid a reference cycle, bind `target_values` to a local variable, so # `self` is not passed into the lambda. 
target_values = self._get_engine_target() From 2d5fa6d60363bdee1aa85c87d40d5b436104a2b3 Mon Sep 17 00:00:00 2001 From: Brock Date: Sun, 31 Oct 2021 11:54:15 -0700 Subject: [PATCH 22/57] restor astype --- pandas/core/arrays/masked.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index b49f995ec35f7..e8b2d0791a2de 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -626,7 +626,7 @@ def value_counts(self, dropna: bool = True) -> Series: index = index.insert(len(index), self.dtype.na_value) - assert index.dtype == self.dtype + index = index.astype(self.dtype) mask = np.zeros(len(counts), dtype="bool") counts = IntegerArray(counts, mask) From adf3ddbc84b5a80e868d52f9c0ca96f7461be2e8 Mon Sep 17 00:00:00 2001 From: Brock Date: Sun, 31 Oct 2021 18:49:41 -0700 Subject: [PATCH 23/57] older numpy compat --- pandas/_libs/index.pyx | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/pandas/_libs/index.pyx b/pandas/_libs/index.pyx index 2222bfd61664f..7499011e1060e 100644 --- a/pandas/_libs/index.pyx +++ b/pandas/_libs/index.pyx @@ -1028,7 +1028,11 @@ cdef class ExtensionEngine: indexer.append(locs) - indexer = np.concatenate(indexer, dtype=np.intp) + try: + indexer = np.concatenate(indexer, dtype=np.intp) + except TypeError: + # numpy<1.20 doesn't accept dtype keyword + indexer = np.concatenate(indexer).astype(np.intp, copy=False) missing = np.array(missing, dtype=np.intp) return indexer, missing @@ -1300,7 +1304,11 @@ cdef class NullableEngine: indexer.append(locs) - indexer = np.concatenate(indexer, dtype=np.intp) + try: + indexer = np.concatenate(indexer, dtype=np.intp) + except TypeError: + # numpy<1.20 doesn't accept dtype keyword + indexer = np.concatenate(indexer).astype(np.intp, copy=False) missing = np.array(missing, dtype=np.intp) return indexer, missing From fd6880e572d0991be6eba7e23645c1e5986f6c00 Mon Sep 17 00:00:00 2001 From: 
Brock Date: Mon, 1 Nov 2021 12:47:46 -0700 Subject: [PATCH 24/57] xfail --- pandas/tests/base/test_misc.py | 4 +++- pandas/tests/extension/test_string.py | 9 ++++++++- 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/pandas/tests/base/test_misc.py b/pandas/tests/base/test_misc.py index 028fa45658d15..dbf76149ed3cd 100644 --- a/pandas/tests/base/test_misc.py +++ b/pandas/tests/base/test_misc.py @@ -6,6 +6,7 @@ from pandas.compat import ( IS64, PYPY, + pa_version_under1p0, ) from pandas.core.dtypes.common import ( @@ -156,7 +157,8 @@ def test_access_by_position(index): assert index[-1] == index[size - 1] msg = f"index {size} is out of bounds for axis 0 with size {size}" - if is_dtype_equal(index.dtype, "string[pyarrow]"): + if pa_version_under1p0 and is_dtype_equal(index.dtype, "string[pyarrow]"): + # TODO(GH#44276) pa_version_under1p0 check should be unnecessary msg = "index out of bounds" with pytest.raises(IndexError, match=msg): index[size] diff --git a/pandas/tests/extension/test_string.py b/pandas/tests/extension/test_string.py index 5049116a9320e..af6c149447d15 100644 --- a/pandas/tests/extension/test_string.py +++ b/pandas/tests/extension/test_string.py @@ -18,6 +18,8 @@ import numpy as np import pytest +from pandas.compat import pa_version_under2p0 + import pandas as pd from pandas.core.arrays import ArrowStringArray from pandas.core.arrays.string_ import StringDtype @@ -186,7 +188,12 @@ class TestPrinting(base.BasePrintingTests): class TestGroupBy(base.BaseGroupbyTests): - pass + def test_groupby_extension_transform(self, data_for_grouping, request): + if data_for_grouping.dtype.storage == "pyarrow" and pa_version_under2p0: + # failure observed in 1.0.1, not in 2.0 or later + mark = pytest.mark.xfail(reason="pyarrow raises in self._data[item]") + request.node.add_marker(mark) + super().test_groupby_extension_transform(data_for_grouping) class Test2DCompat(base.Dim2CompatTests): From 3d9b9af269c1ac24ebd2d7128ba2371c97a11fda Mon Sep 17 00:00:00 
2001 From: Brock Date: Mon, 1 Nov 2021 16:52:07 -0700 Subject: [PATCH 25/57] mypy fixup --- pandas/_libs/index.pyi | 42 +++++++++++++++++++++++++++++++-- pandas/_testing/asserters.py | 12 +++++----- pandas/core/indexes/base.py | 21 ++++++++++------- pandas/core/indexes/interval.py | 4 +++- pandas/core/indexes/multi.py | 7 ++++++ pandas/tests/base/test_misc.py | 10 +++++--- 6 files changed, 75 insertions(+), 21 deletions(-) diff --git a/pandas/_libs/index.pyi b/pandas/_libs/index.pyi index fd521fc446690..21f1c3e147a59 100644 --- a/pandas/_libs/index.pyi +++ b/pandas/_libs/index.pyi @@ -1,9 +1,17 @@ +from typing import TYPE_CHECKING + import numpy as np from pandas._typing import npt from pandas import MultiIndex +if TYPE_CHECKING: + from pandas.core.arrays import ( + BaseMaskedArray, + ExtensionArray, + ) + class IndexEngine: over_size_threshold: bool def __init__(self, values: np.ndarray): ... @@ -65,7 +73,37 @@ class BaseMultiIndexCodesEngine: ) -> npt.NDArray[np.intp]: ... class NullableEngine: - pass + def __init__(self, values: "BaseMaskedArray"): ... + def __contains__(self, val: object) -> bool: ... + def get_loc(self, val: object) -> int | slice | np.ndarray: ... + def get_indexer(self, values: np.ndarray) -> npt.NDArray[np.intp]: ... + def get_indexer_non_unique( + self, + targets: np.ndarray, + ) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]: ... + @property + def is_unique(self) -> bool: ... + @property + def is_monotonic_increasing(self) -> bool: ... + @property + def is_monotonic_decreasing(self) -> bool: ... + def sizeof(self, deep: bool = ...) -> int: ... + def clear_mapping(self): ... class ExtensionEngine: - pass + def __init__(self, values: "ExtensionArray"): ... + def __contains__(self, val: object) -> bool: ... + def get_loc(self, val: object) -> int | slice | np.ndarray: ... + def get_indexer(self, values: np.ndarray) -> npt.NDArray[np.intp]: ... 
+ def get_indexer_non_unique( + self, + targets: np.ndarray, + ) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]: ... + @property + def is_unique(self) -> bool: ... + @property + def is_monotonic_increasing(self) -> bool: ... + @property + def is_monotonic_decreasing(self) -> bool: ... + def sizeof(self, deep: bool = ...) -> int: ... + def clear_mapping(self): ... diff --git a/pandas/_testing/asserters.py b/pandas/_testing/asserters.py index de5057f56fae0..696b237935dd2 100644 --- a/pandas/_testing/asserters.py +++ b/pandas/_testing/asserters.py @@ -42,6 +42,7 @@ take_nd, ) from pandas.core.arrays import ( + BaseMaskedArray, DatetimeArray, ExtensionArray, IntervalArray, @@ -403,15 +404,14 @@ def _get_ilevel_values(index, level): if not left.equals(right): mismatch = left._values != right._values - if not isinstance(mismatch, np.ndarray): - # i.e. its a MaskedArray + if isinstance(mismatch, BaseMaskedArray): + lvalues = cast(BaseMaskedArray, left._values) + rvalues = cast(BaseMaskedArray, right._values) mismatch = mismatch.to_numpy(dtype=int, na_value=0) - mismask = left._values._mask ^ right._values._mask + mismask = lvalues._mask ^ rvalues._mask mismatch[mismask] = 1 - diff = ( - np.sum(mismatch.astype(int)) * 100.0 / len(left) - ) + diff = np.sum(mismatch.astype(int)) * 100.0 / len(left) msg = f"{obj} values are different ({np.round(diff, 5)} %)" raise_assert_detail(obj, msg, left, right) else: diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 430965491bf61..ea4a3d222581c 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -360,7 +360,7 @@ def _outer_indexer( _typ: str = "index" _data: ExtensionArray | np.ndarray - _data_cls: tuple[type[np.ndarray], type[ExtensionArray]] = ( + _data_cls: type[ExtensionArray] | tuple[type[np.ndarray], type[ExtensionArray]] = ( np.ndarray, ExtensionArray, ) @@ -381,7 +381,9 @@ def _outer_indexer( # associated code in pandas 2.0. 
_is_backward_compat_public_numeric_index: bool = False - _engine_type: type[libindex.IndexEngine] = libindex.ObjectEngine + _engine_type: type[libindex.IndexEngine] | type[libindex.NullableEngine] | type[ + libindex.ExtensionEngine + ] = libindex.ObjectEngine # whether we support partial string indexing. Overridden # in DatetimeIndex and PeriodIndex _supports_partial_string_indexing = False @@ -833,7 +835,9 @@ def _cleanup(self) -> None: self._engine.clear_mapping() @cache_readonly - def _engine(self) -> libindex.IndexEngine: + def _engine( + self, + ) -> libindex.IndexEngine | libindex.NullableEngine | libindex.ExtensionEngine: # For base class (object dtype) we get ObjectEngine if isinstance(self._values, BaseMaskedArray): @@ -844,10 +848,11 @@ def _engine(self) -> libindex.IndexEngine: ): return libindex.ExtensionEngine(self._values) + engine_type = cast(type[libindex.IndexEngine], self._engine_type) # to avoid a reference cycle, bind `target_values` to a local variable, so # `self` is not passed into the lambda. 
target_values = self._get_engine_target() - return self._engine_type(target_values) + return engine_type(target_values) @final @cache_readonly @@ -3711,7 +3716,7 @@ def _get_indexer( if target._is_multi and self._is_multi: engine = self._engine # error: "IndexEngine" has no attribute "_extract_level_codes" - tgt_values = engine._extract_level_codes( # type: ignore[attr-defined] + tgt_values = engine._extract_level_codes( # type: ignore[union-attr] target ) @@ -3791,7 +3796,7 @@ def _get_fill_indexer( # TODO: get_indexer_with_fill docstring says values must be _sorted_ # but that doesn't appear to be enforced # error: "IndexEngine" has no attribute "get_indexer_with_fill" - return self._engine.get_indexer_with_fill( # type: ignore[attr-defined] + return self._engine.get_indexer_with_fill( # type: ignore[union-attr] target=target._values, values=self._values, method=method, limit=limit ) @@ -5631,9 +5636,7 @@ def get_indexer_non_unique( if self._is_multi and target._is_multi: engine = self._engine # error: "IndexEngine" has no attribute "_extract_level_codes" - tgt_values = engine._extract_level_codes( # type: ignore[attr-defined] - target - ) + tgt_values = engine._extract_level_codes(target) # type: ignore[union-attr] indexer, missing = self._engine.get_indexer_non_unique(tgt_values) return ensure_platform_int(indexer), ensure_platform_int(missing) diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py index ed8072f7b0dd5..7c61b31b8ef66 100644 --- a/pandas/core/indexes/interval.py +++ b/pandas/core/indexes/interval.py @@ -731,7 +731,9 @@ def _get_indexer_pointwise( locs = np.array(locs, ndmin=1) else: # FIXME: This is wrong; its boolean; not reached - assert locs.dtype.kind == "i" + # error: Item "int" of "Union[int, ndarray[Any, Any]]" + # has no attribute "dtype" + assert locs.dtype.kind == "i" # type: ignore[union-attr] except KeyError: missing.append(i) diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 
fe97d61be7548..febd0ce1279e8 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -283,6 +283,13 @@ class MultiIndex(Index): of the mentioned helper methods. """ + # error: Incompatible types in assignment (expression has type + # "Type[BaseMultiIndexCodesEngine]", base class "Index" defined the type as + # "Union[Type[IndexEngine], Type[NullableEngine], Type[ExtensionEngine]]") + _engine_type: type[ + libindex.BaseMultiIndexCodesEngine + ] = libindex.BaseMultiIndexCodesEngine # type: ignore[assignment] + _hidden_attrs = Index._hidden_attrs | frozenset() # initialize to zero-length tuples to make everything work diff --git a/pandas/tests/base/test_misc.py b/pandas/tests/base/test_misc.py index dbf76149ed3cd..d4708cb4dcfcc 100644 --- a/pandas/tests/base/test_misc.py +++ b/pandas/tests/base/test_misc.py @@ -6,7 +6,6 @@ from pandas.compat import ( IS64, PYPY, - pa_version_under1p0, ) from pandas.core.dtypes.common import ( @@ -157,8 +156,13 @@ def test_access_by_position(index): assert index[-1] == index[size - 1] msg = f"index {size} is out of bounds for axis 0 with size {size}" - if pa_version_under1p0 and is_dtype_equal(index.dtype, "string[pyarrow]"): - # TODO(GH#44276) pa_version_under1p0 check should be unnecessary + try: + eq = is_dtype_equal(index.dtype, "string[pyarrow]") + except ImportError: + # TODO(GH#44276) is_dtype_equal can raise here + eq = False + + if eq: msg = "index out of bounds" with pytest.raises(IndexError, match=msg): index[size] From 2d75377c5812d3d62fab6c395bdc171272d23cd9 Mon Sep 17 00:00:00 2001 From: Brock Date: Tue, 2 Nov 2021 10:02:37 -0700 Subject: [PATCH 26/57] lint fixups --- pandas/_libs/lib.pxd | 1 + pandas/core/indexes/base.py | 2 +- pandas/core/indexing.py | 4 +++- pandas/tests/base/test_value_counts.py | 4 ++-- 4 files changed, 7 insertions(+), 4 deletions(-) diff --git a/pandas/_libs/lib.pxd b/pandas/_libs/lib.pxd index 1306960b403e2..46a339f2e7cbb 100644 --- a/pandas/_libs/lib.pxd +++ 
b/pandas/_libs/lib.pxd @@ -1,5 +1,6 @@ from numpy cimport ndarray + cdef bint c_is_list_like(object, bint) except -1 cpdef ndarray eq_NA_compat(ndarray[object] arr, object key) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index ea4a3d222581c..9f2ca205d9efe 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -848,7 +848,7 @@ def _engine( ): return libindex.ExtensionEngine(self._values) - engine_type = cast(type[libindex.IndexEngine], self._engine_type) + engine_type = cast("type[libindex.IndexEngine]", self._engine_type) # to avoid a reference cycle, bind `target_values` to a local variable, so # `self` is not passed into the lambda. target_values = self._get_engine_target() diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index a78cc6bbb8bee..68e914a186381 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -993,7 +993,9 @@ def _validate_key(self, key, axis: int): # slice of labels (where start-end in labels) # slice of integers (only if in the labels) # boolean not in slice and with boolean index - if isinstance(key, bool) and not (is_bool_dtype(self.obj.index) or self.obj.index.dtype.name == "boolean"): + if isinstance(key, bool) and not ( + is_bool_dtype(self.obj.index) or self.obj.index.dtype.name == "boolean" + ): raise KeyError( f"{key}: boolean label can not be used without a boolean index" ) diff --git a/pandas/tests/base/test_value_counts.py b/pandas/tests/base/test_value_counts.py index 3566ad5df226c..a5f143590ba03 100644 --- a/pandas/tests/base/test_value_counts.py +++ b/pandas/tests/base/test_value_counts.py @@ -85,8 +85,8 @@ def test_value_counts_null(null_obj, index_or_series_obj): # can't use expected[null_obj] = 3 as # IntervalIndex doesn't allow assignment - #new_entry = Series({np.nan: 3}, dtype=np.int64) - #expected = expected.append(new_entry) # TODO: test that both of these work with IntegerNAIndex + # TODO: test that both expected.append(Series({np.nan: 3}, 
dtype=np.int64)) + # and expected[null_obj] = 3 work with IntegerNAIndex expected[null_obj] = 3 result = obj.value_counts(dropna=False) From 37b9370334437354ad2d99c0fe2cc489c30410b6 Mon Sep 17 00:00:00 2001 From: Brock Date: Tue, 2 Nov 2021 20:43:16 -0700 Subject: [PATCH 27/57] avoid warnings --- pandas/_libs/index.pyx | 3 +-- pandas/core/indexes/base.py | 10 ++++++++-- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/pandas/_libs/index.pyx b/pandas/_libs/index.pyx index 7499011e1060e..c289e21dba87e 100644 --- a/pandas/_libs/index.pyx +++ b/pandas/_libs/index.pyx @@ -1199,8 +1199,7 @@ cdef class NullableEngine: cdef _get_bool_indexer(self, val): if val is NA: - #if not self.has_missing: - # raise KeyError(val) + # TODO: KeyError(val) if not has_missing? # TODO: readonly? copy? return self._mask.view("uint8") diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 9f2ca205d9efe..a8b3aaf27fec9 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -3466,7 +3466,7 @@ def symmetric_difference(self, other, result_name=None, sort=None): res_values = concat_compat([left_diff, right_diff]) res_values = _maybe_try_sort(res_values, sort) - result = Index(res_values, name=result_name) + result = Index(res_values, name=result_name, dtype=res_values.dtype) if self._is_multi: self = cast("MultiIndex", self) @@ -3490,7 +3490,13 @@ def _assert_can_do_setop(self, other) -> bool: def _convert_can_do_setop(self, other) -> tuple[Index, Hashable]: if not isinstance(other, Index): - other = Index(other, name=self.name) + # TODO(2.0): no need to special-case here once _with_infer + # deprecation is enforced + if hasattr(other, "dtype"): + other = Index(other, name=self.name, dtype=other.dtype) + else: + # e.g. 
list + other = Index(other, name=self.name) result_name = self.name else: result_name = get_op_result_name(self, other) From 0e56218fa88979723aaa32eed9e3749ae727bf12 Mon Sep 17 00:00:00 2001 From: Brock Date: Thu, 4 Nov 2021 18:02:56 -0700 Subject: [PATCH 28/57] avoid FutureWarnings --- pandas/core/indexes/category.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py index 005c5f75e6cfa..e037a7e0ed75f 100644 --- a/pandas/core/indexes/category.py +++ b/pandas/core/indexes/category.py @@ -462,7 +462,7 @@ def reindex( else: # e.g. test_reindex_with_categoricalindex, test_reindex_duplicate_target new_target = np.asarray(new_target) - new_target = Index(new_target, name=self.name) + new_target = Index._with_infer(new_target, name=self.name) return new_target, indexer From e8987cd04e41e3fb8d748bda72278f4d8f6c0a53 Mon Sep 17 00:00:00 2001 From: Brock Date: Thu, 4 Nov 2021 21:40:25 -0700 Subject: [PATCH 29/57] catch RuntimeWarning --- pandas/core/indexes/base.py | 1 + pandas/tests/indexes/test_setops.py | 9 ++++++++- 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index a8b3aaf27fec9..69e4bd6ae06b2 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -5845,6 +5845,7 @@ def _maybe_promote(self, other: Index) -> tuple[Index, Index]: # TODO: we dont have tests that get here return type(other)(self), other elif self.inferred_type == "boolean": + return self, other if not is_object_dtype(self.dtype): return self.astype("object"), other.astype("object") diff --git a/pandas/tests/indexes/test_setops.py b/pandas/tests/indexes/test_setops.py index bec0a41b9bb6b..5e2c2a55fa713 100644 --- a/pandas/tests/indexes/test_setops.py +++ b/pandas/tests/indexes/test_setops.py @@ -208,7 +208,14 @@ def test_union_base(self, index): first = index[3:] second = index[:5] everything = index - union = first.union(second) + + 
warn = None + if is_dtype_equal(index.dtype, "boolean") and index.isna().any(): + warn = RuntimeWarning + + msg = "boolean value of NA is ambiguous" + with tm.assert_produces_warning(warn, match=msg): + union = first.union(second) assert tm.equalContents(union, everything) if is_datetime64tz_dtype(index.dtype): From fef88a7e252fc594b8a7ef4afd7ac04d9ebc240d Mon Sep 17 00:00:00 2001 From: Brock Date: Thu, 4 Nov 2021 21:43:04 -0700 Subject: [PATCH 30/57] remove unreachable --- pandas/core/indexes/base.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 69e4bd6ae06b2..7538017c29663 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -5844,10 +5844,6 @@ def _maybe_promote(self, other: Index) -> tuple[Index, Index]: elif self.inferred_type == "timedelta" and isinstance(other, ABCTimedeltaIndex): # TODO: we dont have tests that get here return type(other)(self), other - elif self.inferred_type == "boolean": - return self, other - if not is_object_dtype(self.dtype): - return self.astype("object"), other.astype("object") elif self.dtype.kind == "u" and other.dtype.kind == "i": # GH#41873 From 63f26ba3d8825e85531cbea6add1ea37023627c9 Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 19 Nov 2021 13:14:51 -0800 Subject: [PATCH 31/57] revert no-longer-necessary --- pandas/core/arrays/floating.py | 3 --- pandas/core/arrays/sparse/array.py | 4 ++-- 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/pandas/core/arrays/floating.py b/pandas/core/arrays/floating.py index 1dee05954d8d5..e1eb5110ef5fe 100644 --- a/pandas/core/arrays/floating.py +++ b/pandas/core/arrays/floating.py @@ -429,9 +429,6 @@ def _maybe_mask_result(self, result, mask, other, op_name: str): return type(self)(result, mask, copy=False) - def isna(self): - return self._mask | np.isnan(self._data) - _dtype_docstring = """ An ExtensionDtype for {dtype} data. 
diff --git a/pandas/core/arrays/sparse/array.py b/pandas/core/arrays/sparse/array.py index 9eeeb93f673f2..c054710a01f75 100644 --- a/pandas/core/arrays/sparse/array.py +++ b/pandas/core/arrays/sparse/array.py @@ -954,10 +954,10 @@ def __getitem__( # mypy doesn't know we have an array here key = cast(np.ndarray, key) return self.take(np.arange(len(key), dtype=np.int32)[key]) - elif lib.is_list_like(key): + elif hasattr(key, "__len__"): return self.take(key) else: - raise IndexError(f"Cannot slice with '{key}'") + raise ValueError(f"Cannot slice with '{key}'") return type(self)(data_slice, kind=self.kind) From 11d35642c0333e69809bdba4175ab053bb0c8386 Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 19 Nov 2021 18:50:11 -0800 Subject: [PATCH 32/57] Share ExtensionEngine/NullableEngine methods --- pandas/_libs/index.pyx | 358 ++++++++--------------------- pandas/core/arrays/string_arrow.py | 6 +- pandas/core/indexes/base.py | 11 +- 3 files changed, 105 insertions(+), 270 deletions(-) diff --git a/pandas/_libs/index.pyx b/pandas/_libs/index.pyx index c289e21dba87e..e6c0914e9e233 100644 --- a/pandas/_libs/index.pyx +++ b/pandas/_libs/index.pyx @@ -794,8 +794,9 @@ cdef class BaseMultiIndexCodesEngine: include "index_class_helper.pxi" +@cython.internal @cython.freelist(32) -cdef class ExtensionEngine: +cdef class SharedEngine: cdef readonly: object values # ExtensionArray bint over_size_threshold @@ -804,13 +805,14 @@ cdef class ExtensionEngine: bint unique, monotonic_inc, monotonic_dec bint need_monotonic_check, need_unique_check - def __init__(self, values: "ExtensionArray"): - self.values = values - - self.over_size_threshold = len(values) >= _SIZE_CUTOFF - self.need_unique_check = True - self.need_monotonic_check = True - self.need_unique_check = True + def __contains__(self, val: object) -> bool: + # We assume before we get here: + # - val is hashable + try: + self.get_loc(val) + return True + except KeyError: + return False def clear_mapping(self): # for compat with 
IndexEngine @@ -825,6 +827,9 @@ cdef class ExtensionEngine: self.need_unique_check = False return self.unique + cdef _do_monotonic_check(self): + raise NotImplementedError + @property def is_monotonic_increasing(self) -> bool: if self.need_monotonic_check: @@ -839,37 +844,19 @@ cdef class ExtensionEngine: return self.monotonic_dec == 1 - cdef inline _do_monotonic_check(self): - cdef: - bint is_unique - - # FIXME: shouldn't depend on non-required _values_for_argsort - try: - self.monotonic_inc, self.monotonic_dec, is_unique = \ - self._call_monotonic(self.values._values_for_argsort()) - except TypeError: - self.monotonic_inc = 0 - self.monotonic_dec = 0 - is_unique = 0 + cdef _call_monotonic(self, values): + return algos.is_monotonic(values, timelike=False) - self.need_monotonic_check = 0 + def sizeof(self, deep: bool = False) -> int: + """ return the sizeof our mapping """ + return 0 - # we can only be sure of uniqueness if is_unique=1 - if is_unique: - self.unique = 1 - self.need_unique_check = 0 + def __sizeof__(self) -> int: + return self.sizeof() - cdef _call_monotonic(self, values): - return algos.is_monotonic(values, timelike=False) + cdef _check_type(self, object obj): + raise NotImplementedError - def __contains__(self, val: object) -> bool: - # We assume before we get here: - # - val is hashable - try: - self.get_loc(val) - return True - except KeyError: - return False cpdef get_loc(self, object val): # -> Py_ssize_t | slice | ndarray[bool] @@ -899,17 +886,6 @@ cdef class ExtensionEngine: return self._get_loc_duplicates(val) - cdef Py_ssize_t _searchsorted_left(self, val) except? -1: - """ - See ObjectEngine._searchsorted_left.__doc__. - """ - try: - loc = self.values.searchsorted(val, side="left") - except TypeError as err: - # GH#35788 e.g. 
val=None with float64 values - raise KeyError(val) - return loc - cdef inline _get_loc_duplicates(self, object val): # -> Py_ssize_t | slice | ndarray[bool] cdef: @@ -934,22 +910,19 @@ cdef class ExtensionEngine: return self._maybe_get_bool_indexer(val) - cdef ndarray _get_bool_indexer(self, val): - if checknull(val): - return self.values.isna().view("uint8") - + cdef Py_ssize_t _searchsorted_left(self, val) except? -1: + """ + See ObjectEngine._searchsorted_left.__doc__. + """ try: - return self.values == val - except TypeError: - # e.g. if __eq__ returns a BooleanArray instead of ndarry[bool] - try: - return (self.values == val).to_numpy(dtype=bool, na_value=False) - except (TypeError, AttributeError) as err: - # e.g. (self.values == val) returned a bool - # see test_get_loc_generator[string[pyarrow]] - # e.g. self.value == val raises TypeError bc generator has no len - # see test_get_loc_generator[string[python]] - raise KeyError from err + loc = self.values.searchsorted(val, side="left") + except TypeError as err: + # GH#35788 e.g. 
val=None with float64 values + raise KeyError(val) + return loc + + cdef ndarray _get_bool_indexer(self, val): + raise NotImplementedError cdef _maybe_get_bool_indexer(self, object val): # Returns ndarray[bool] or int @@ -959,17 +932,8 @@ cdef class ExtensionEngine: indexer = self._get_bool_indexer(val) return _unpack_bool_indexer(indexer, val) - def sizeof(self, deep: bool = False) -> int: - """ return the sizeof our mapping """ - return 0 - - def __sizeof__(self) -> int: - return self.sizeof() - - cdef _check_type(self, object val): - hash(val) - - def get_indexer(self, values: "ExtensionArray") -> np.ndarray: + def get_indexer(self, values) -> np.ndarray: + # values : type(self.values) # Note: we only get here with self.is_unique cdef: Py_ssize_t i, N = len(values) @@ -989,13 +953,17 @@ cdef class ExtensionEngine: return res - def get_indexer_non_unique(self, targets: "ExtensionArray"): + def get_indexer_non_unique(self, targets): """ Return an indexer suitable for taking from a non unique index return the labels in the same order as the target and a missing indexer into the targets (which correspond to the -1 indices in the results + Parameters + ---------- + targets : type(self.values) + Returns ------- indexer : np.ndarray[np.intp] @@ -1038,18 +1006,61 @@ cdef class ExtensionEngine: return indexer, missing -@cython.freelist(32) -cdef class NullableEngine: +cdef class ExtensionEngine(SharedEngine): + def __init__(self, values: "ExtensionArray"): + self.values = values + + self.over_size_threshold = len(values) >= _SIZE_CUTOFF + self.need_unique_check = True + self.need_monotonic_check = True + self.need_unique_check = True + + cdef _do_monotonic_check(self): + cdef: + bint is_unique + + # FIXME: shouldn't depend on non-required _values_for_argsort + try: + self.monotonic_inc, self.monotonic_dec, is_unique = \ + self._call_monotonic(self.values._values_for_argsort()) + except TypeError: + self.monotonic_inc = 0 + self.monotonic_dec = 0 + is_unique = 0 + + 
self.need_monotonic_check = 0 + + # we can only be sure of uniqueness if is_unique=1 + if is_unique: + self.unique = 1 + self.need_unique_check = 0 + + cdef ndarray _get_bool_indexer(self, val): + if checknull(val): + return self.values.isna().view("uint8") + + try: + return self.values == val + except TypeError: + # e.g. if __eq__ returns a BooleanArray instead of ndarry[bool] + try: + return (self.values == val).to_numpy(dtype=bool, na_value=False) + except (TypeError, AttributeError) as err: + # e.g. (self.values == val) returned a bool + # see test_get_loc_generator[string[pyarrow]] + # e.g. self.value == val raises TypeError bc generator has no len + # see test_get_loc_generator[string[python]] + raise KeyError from err + + cdef _check_type(self, object val): + hash(val) + + +cdef class NullableEngine(SharedEngine): cdef readonly: ndarray _values, _mask - bint over_size_threshold bint has_missing - object values # MaskedArray - - cdef: - bint unique, monotonic_inc, monotonic_dec - bint need_monotonic_check, need_unique_check def __init__(self, values: "MaskedArray"): self.values = values @@ -1062,34 +1073,7 @@ cdef class NullableEngine: self.over_size_threshold = len(values) >= _SIZE_CUTOFF self.need_unique_check = True - def clear_mapping(self): - # for compat with IndexEngine - pass - - @property - def is_unique(self) -> bool: - if self.need_unique_check: - arr = self.values.unique() - self.unique = len(arr) == len(self.values) - - self.need_unique_check = False - return self.unique - - @property - def is_monotonic_increasing(self) -> bool: - if self.need_monotonic_check: - self._do_monotonic_check() - - return self.monotonic_inc == 1 - - @property - def is_monotonic_decreasing(self) -> bool: - if self.need_monotonic_check: - self._do_monotonic_check() - - return self.monotonic_dec == 1 - - cdef inline _do_monotonic_check(self): + cdef _do_monotonic_check(self): cdef: bint is_unique @@ -1115,26 +1099,11 @@ cdef class NullableEngine: self.unique = 1 
self.need_unique_check = 0 - cdef _call_monotonic(self, values): - return algos.is_monotonic(values, timelike=False) - - def __contains__(self, val: object) -> bool: - # We assume before we get here: - # - val is hashable - try: - self.get_loc(val) - return True - except KeyError: - return False - cpdef get_loc(self, object val): # -> Py_ssize_t | slice | ndarray[bool] cdef: Py_ssize_t loc - if is_definitely_invalid_key(val): - raise TypeError(f"'{val}' is an invalid key") - if val is NA: # TODO: return copy? readonly view? # TODO: do this later on to keep same pattern as IndexEngine? @@ -1142,62 +1111,9 @@ cdef class NullableEngine: raise KeyError(val) return _unpack_bool_indexer(self._mask, val) - self._check_type(val) - - if self.over_size_threshold and self.is_monotonic_increasing: - if not self.is_unique: - return self._get_loc_duplicates(val) - - values = self.values - - loc = self._searchsorted_left(val) - if loc >= len(values): - raise KeyError(val) - if values[loc] != val: - raise KeyError(val) - return loc - - if not self.unique: - return self._get_loc_duplicates(val) - - return self._get_loc_duplicates(val) - - cdef Py_ssize_t _searchsorted_left(self, val) except? -1: - """ - See ObjectEngine._searchsorted_left.__doc__. - """ - try: - loc = self.values.searchsorted(val, side="left") - except TypeError as err: - # GH#35788 e.g. val=None with float64 values - raise KeyError(val) - return loc - - cdef inline _get_loc_duplicates(self, object val): - # -> Py_ssize_t | slice | ndarray[bool] - cdef: - Py_ssize_t diff - - if self.is_monotonic_increasing: - values = self.values - try: - left = values.searchsorted(val, side='left') - right = values.searchsorted(val, side='right') - except TypeError: - # e.g. 
GH#29189 get_loc(None) with a Float64Index - raise KeyError(val) - - diff = right - left - if diff == 0: - raise KeyError(val) - elif diff == 1: - return left - else: - return slice(left, right) - - return self._maybe_get_bool_indexer(val) + return SharedEngine.get_loc(self, val) - cdef _get_bool_indexer(self, val): + cdef ndarray _get_bool_indexer(self, val): if val is NA: # TODO: KeyError(val) if not has_missing? # TODO: readonly? copy? @@ -1211,21 +1127,6 @@ cdef class NullableEngine: res[self._mask] = False return res - cdef _maybe_get_bool_indexer(self, object val): - # Returns ndarray[bool] or int - cdef: - ndarray[uint8_t, ndim=1, cast=True] indexer - - indexer = self._get_bool_indexer(val) - return _unpack_bool_indexer(indexer, val) - - def sizeof(self, deep: bool = False) -> int: - """ return the sizeof our mapping """ - return 0 - - def __sizeof__(self) -> int: - return self.sizeof() - cdef _check_type(self, object val): kind = self._values.dtype.kind if kind in ["i", "u"]: @@ -1242,72 +1143,3 @@ cdef class NullableEngine: if not util.is_integer_object(val) and not util.is_float_object(val): # in particular catch bool and avoid casting True -> 1.0 raise KeyError(val) - - def get_indexer(self, values: "MaskedArray") -> np.ndarray: - # Note: we only get here with self.is_unique - cdef: - Py_ssize_t i, N = len(values) - - res = np.empty(N, dtype=np.intp) - - for i in range(N): - val = values[i] - try: - loc = self.get_loc(val) - # Because we are unique, loc should always be an integer - except KeyError: - loc = -1 - else: - assert util.is_integer_object(loc), (loc, val) - - res[i] = loc - - return res - - def get_indexer_non_unique(self, targets: "MaskedArray"): - """ - Return an indexer suitable for taking from a non unique index - return the labels in the same order as the target - and a missing indexer into the targets (which correspond - to the -1 indices in the results - - Returns - ------- - indexer : np.ndarray[np.intp] - missing : np.ndarray[np.intp] 
- """ - cdef: - Py_ssize_t i, N = len(targets) - - indexer = [] - missing = [] - - # See also IntervalIndex.get_indexer_pointwise - for i in range(N): - val = targets[i] - - try: - locs = self.get_loc(val) - except KeyError: - locs = np.array([-1], dtype=np.intp) - missing.append(i) - else: - if isinstance(locs, slice): - # Only needed for get_indexer_non_unique - locs = np.arange(locs.start, locs.stop, locs.step, dtype=np.intp) - elif util.is_integer_object(locs): - locs = np.array([locs], dtype=np.intp) - else: - assert locs.dtype.kind == "b" - locs = locs.nonzero()[0] - - indexer.append(locs) - - try: - indexer = np.concatenate(indexer, dtype=np.intp) - except TypeError: - # numpy<1.20 doesn't accept dtype keyword - indexer = np.concatenate(indexer).astype(np.intp, copy=False) - missing = np.array(missing, dtype=np.intp) - - return indexer, missing diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 20c34e811fd2a..272e51d2c1120 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -312,7 +312,11 @@ def __getitem__( ) elif isinstance(item, tuple): item = unpack_tuple_and_ellipses(item) - if item is Ellipsis: + + # error: Non-overlapping identity check (left operand type: + # "Union[Union[int, integer[Any]], Union[slice, List[int], + # ndarray[Any, Any]]]", right operand type: "ellipsis") + if item is Ellipsis: # type: ignore[comparison-overlap] # TODO: should be handled by pyarrow? item = slice(None) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index a24eeed04e56c..3a802691e4c76 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -381,9 +381,7 @@ def _outer_indexer( # associated code in pandas 2.0. 
_is_backward_compat_public_numeric_index: bool = False - _engine_type: type[libindex.IndexEngine] | type[libindex.NullableEngine] | type[ - libindex.ExtensionEngine - ] = libindex.ObjectEngine + _engine_type: type[libindex.IndexEngine] = libindex.ObjectEngine # whether we support partial string indexing. Overridden # in DatetimeIndex and PeriodIndex _supports_partial_string_indexing = False @@ -666,7 +664,6 @@ def _with_infer(cls, *args, **kwargs): Constructor that uses the 1.0.x behavior inferring numeric dtypes for ndarray[object] inputs. """ - with warnings.catch_warnings(): warnings.filterwarnings("ignore", ".*the Index constructor", FutureWarning) result = cls(*args, **kwargs) @@ -848,11 +845,10 @@ def _engine( ): return libindex.ExtensionEngine(self._values) - engine_type = cast("type[libindex.IndexEngine]", self._engine_type) # to avoid a reference cycle, bind `target_values` to a local variable, so # `self` is not passed into the lambda. target_values = self._get_engine_target() - return engine_type(target_values) + return self._engine_type(target_values) @final @cache_readonly @@ -967,6 +963,9 @@ def view(self, cls=None): arr = self._data.view("i8") idx_cls = self._dtype_to_subclass(dtype) + # NB: we only get here for subclasses that override + # _data_cls such that it is a type and not a tuple + # of types. 
arr_cls = idx_cls._data_cls arr = arr_cls(self._data.view("i8"), dtype=dtype) return idx_cls._simple_new(arr, name=self.name) From 09d8bf1082b713cc23af5d4d755c8174313b84eb Mon Sep 17 00:00:00 2001 From: Brock Date: Sat, 20 Nov 2021 20:05:02 -0800 Subject: [PATCH 33/57] lint fixup --- pandas/_libs/index.pyx | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/_libs/index.pyx b/pandas/_libs/index.pyx index e6c0914e9e233..11be0afe4d86a 100644 --- a/pandas/_libs/index.pyx +++ b/pandas/_libs/index.pyx @@ -857,7 +857,6 @@ cdef class SharedEngine: cdef _check_type(self, object obj): raise NotImplementedError - cpdef get_loc(self, object val): # -> Py_ssize_t | slice | ndarray[bool] cdef: From 7d783b13ee3b8095944e9246d9a72baccae43852 Mon Sep 17 00:00:00 2001 From: Brock Date: Sat, 20 Nov 2021 20:38:07 -0800 Subject: [PATCH 34/57] revert no-longer-necessary --- pandas/_testing/asserters.py | 8 -------- pandas/core/indexes/multi.py | 7 ------- 2 files changed, 15 deletions(-) diff --git a/pandas/_testing/asserters.py b/pandas/_testing/asserters.py index 354316af42b40..da8c0048decaa 100644 --- a/pandas/_testing/asserters.py +++ b/pandas/_testing/asserters.py @@ -43,7 +43,6 @@ take_nd, ) from pandas.core.arrays import ( - BaseMaskedArray, DatetimeArray, ExtensionArray, IntervalArray, @@ -405,13 +404,6 @@ def _get_ilevel_values(index, level): if not left.equals(right): mismatch = left._values != right._values - if isinstance(mismatch, BaseMaskedArray): - lvalues = cast(BaseMaskedArray, left._values) - rvalues = cast(BaseMaskedArray, right._values) - mismatch = mismatch.to_numpy(dtype=int, na_value=0) - mismask = lvalues._mask ^ rvalues._mask - mismatch[mismask] = 1 - diff = np.sum(mismatch.astype(int)) * 100.0 / len(left) msg = f"{obj} values are different ({np.round(diff, 5)} %)" raise_assert_detail(obj, msg, left, right) diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index e22b19e345fd9..128aa8e282a0d 100644 --- a/pandas/core/indexes/multi.py +++ 
b/pandas/core/indexes/multi.py @@ -284,13 +284,6 @@ class MultiIndex(Index): of the mentioned helper methods. """ - # error: Incompatible types in assignment (expression has type - # "Type[BaseMultiIndexCodesEngine]", base class "Index" defined the type as - # "Union[Type[IndexEngine], Type[NullableEngine], Type[ExtensionEngine]]") - _engine_type: type[ - libindex.BaseMultiIndexCodesEngine - ] = libindex.BaseMultiIndexCodesEngine # type: ignore[assignment] - _hidden_attrs = Index._hidden_attrs | frozenset() # initialize to zero-length tuples to make everything work From de642497e6f1912f65a8f970d9e6d99274453349 Mon Sep 17 00:00:00 2001 From: Brock Date: Mon, 22 Nov 2021 16:38:19 -0800 Subject: [PATCH 35/57] remove unnecessary from test_setops --- pandas/tests/indexes/test_setops.py | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/pandas/tests/indexes/test_setops.py b/pandas/tests/indexes/test_setops.py index aa3c55cccfcef..d84ef44f236fc 100644 --- a/pandas/tests/indexes/test_setops.py +++ b/pandas/tests/indexes/test_setops.py @@ -209,13 +209,7 @@ def test_union_base(self, index): second = index[:5] everything = index - warn = None - if is_dtype_equal(index.dtype, "boolean") and index.isna().any(): - warn = RuntimeWarning - - msg = "boolean value of NA is ambiguous" - with tm.assert_produces_warning(warn, match=msg): - union = first.union(second) + union = first.union(second) assert tm.equalContents(union, everything) if is_datetime64tz_dtype(index.dtype): From 1bb2901bbf70ae870bcba934133328e41aceb035 Mon Sep 17 00:00:00 2001 From: Brock Date: Mon, 22 Nov 2021 16:40:03 -0800 Subject: [PATCH 36/57] suggested edits --- pandas/tests/base/test_misc.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/pandas/tests/base/test_misc.py b/pandas/tests/base/test_misc.py index d4708cb4dcfcc..a5af02122097d 100644 --- a/pandas/tests/base/test_misc.py +++ b/pandas/tests/base/test_misc.py @@ -84,7 +84,7 @@ def 
test_memory_usage(index_or_series_obj): is_categorical = is_categorical_dtype(obj.dtype) or ( isinstance(obj, Series) and is_categorical_dtype(obj.index.dtype) ) - is_string = is_dtype_equal(obj, "string[python]") or ( + is_object_string = is_dtype_equal(obj, "string[python]") or ( is_ser and is_dtype_equal(obj.index.dtype, "string[python]") ) @@ -94,10 +94,9 @@ def test_memory_usage(index_or_series_obj): else: expected = 108 if IS64 else 64 assert res_deep == res == expected - elif is_object or is_categorical or is_string: + elif is_object or is_categorical or is_object_string: # only deep will pick them up assert res_deep > res - assert res_deep > res else: assert res == res_deep From 23bb32560ee866f0728f1c52713ba838af9e46dd Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 26 Nov 2021 12:00:51 +0100 Subject: [PATCH 37/57] actually run the new base extension tests for all EAs --- pandas/tests/extension/base/__init__.py | 1 + pandas/tests/extension/base/ea_index.py | 11 ---------- pandas/tests/extension/base/index.py | 20 +++++++++++++++++++ .../tests/extension/decimal/test_decimal.py | 4 ++++ pandas/tests/extension/json/test_json.py | 4 ++++ pandas/tests/extension/test_boolean.py | 4 ++++ pandas/tests/extension/test_categorical.py | 4 ++++ pandas/tests/extension/test_datetime.py | 4 ++++ pandas/tests/extension/test_floating.py | 4 ++++ pandas/tests/extension/test_integer.py | 4 ++++ pandas/tests/extension/test_interval.py | 4 ++++ pandas/tests/extension/test_numpy.py | 6 ++++++ pandas/tests/extension/test_period.py | 4 ++++ pandas/tests/extension/test_sparse.py | 13 ++++++++++++ pandas/tests/extension/test_string.py | 4 ++++ 15 files changed, 80 insertions(+), 11 deletions(-) delete mode 100644 pandas/tests/extension/base/ea_index.py create mode 100644 pandas/tests/extension/base/index.py diff --git a/pandas/tests/extension/base/__init__.py b/pandas/tests/extension/base/__init__.py index 910b43a2cd148..876f595dcb03a 100644 --- 
a/pandas/tests/extension/base/__init__.py +++ b/pandas/tests/extension/base/__init__.py @@ -47,6 +47,7 @@ class TestMyDtype(BaseDtypeTests): from pandas.tests.extension.base.dtype import BaseDtypeTests # noqa from pandas.tests.extension.base.getitem import BaseGetitemTests # noqa from pandas.tests.extension.base.groupby import BaseGroupbyTests # noqa +from pandas.tests.extension.base.index import BaseIndexTests # noqa from pandas.tests.extension.base.interface import BaseInterfaceTests # noqa from pandas.tests.extension.base.io import BaseParsingTests # noqa from pandas.tests.extension.base.methods import BaseMethodsTests # noqa diff --git a/pandas/tests/extension/base/ea_index.py b/pandas/tests/extension/base/ea_index.py deleted file mode 100644 index 8309842f7134f..0000000000000 --- a/pandas/tests/extension/base/ea_index.py +++ /dev/null @@ -1,11 +0,0 @@ -""" -Tests for Indexes backed by arbitrary ExtensionArrays. -""" -import pandas as pd -from pandas.tests.extension.base.base import BaseExtensionTests - - -class BaseExtensionIndexTests(BaseExtensionTests): - def test_index_from_array(self, data): - idx = pd.Index(data) - assert data.dtype == idx.dtype diff --git a/pandas/tests/extension/base/index.py b/pandas/tests/extension/base/index.py new file mode 100644 index 0000000000000..2539c38733a6c --- /dev/null +++ b/pandas/tests/extension/base/index.py @@ -0,0 +1,20 @@ +""" +Tests for Indexes backed by arbitrary ExtensionArrays. 
+""" +import pandas as pd +from pandas.tests.extension.base.base import BaseExtensionTests + + +class BaseIndexTests(BaseExtensionTests): + """Tests for Index object backed by an ExtensionArray""" + + def test_index_from_array(self, data): + idx = pd.Index(data) + assert data.dtype == idx.dtype + + def test_index_from_listlike_with_dtype(self, data): + idx = pd.Index(data, dtype=data.dtype) + assert idx.dtype == data.dtype + + idx = pd.Index(list(data), dtype=data.dtype) + assert idx.dtype == data.dtype diff --git a/pandas/tests/extension/decimal/test_decimal.py b/pandas/tests/extension/decimal/test_decimal.py index 53416b6a3e9db..a00860f4d02da 100644 --- a/pandas/tests/extension/decimal/test_decimal.py +++ b/pandas/tests/extension/decimal/test_decimal.py @@ -101,6 +101,10 @@ def test_take_na_value_other_decimal(self): self.assert_extension_array_equal(result, expected) +class TestIndex(base.BaseIndexTests): + pass + + class TestMissing(base.BaseMissingTests): pass diff --git a/pandas/tests/extension/json/test_json.py b/pandas/tests/extension/json/test_json.py index d5ecd8909319e..d530a75b74c8f 100644 --- a/pandas/tests/extension/json/test_json.py +++ b/pandas/tests/extension/json/test_json.py @@ -196,6 +196,10 @@ class TestGetitem(BaseJSON, base.BaseGetitemTests): pass +class TestIndex(BaseJSON, base.BaseIndexTests): + pass + + class TestMissing(BaseJSON, base.BaseMissingTests): @pytest.mark.skip(reason="Setting a dict as a scalar") def test_fillna_series(self): diff --git a/pandas/tests/extension/test_boolean.py b/pandas/tests/extension/test_boolean.py index 05455905860d2..a4921b2f0bdf6 100644 --- a/pandas/tests/extension/test_boolean.py +++ b/pandas/tests/extension/test_boolean.py @@ -95,6 +95,10 @@ class TestSetitem(base.BaseSetitemTests): pass +class TestIndex(base.BaseIndexTests): + pass + + class TestMissing(base.BaseMissingTests): pass diff --git a/pandas/tests/extension/test_categorical.py b/pandas/tests/extension/test_categorical.py index 
6a1a9512bc036..f92c7dde9eb92 100644 --- a/pandas/tests/extension/test_categorical.py +++ b/pandas/tests/extension/test_categorical.py @@ -144,6 +144,10 @@ class TestSetitem(base.BaseSetitemTests): pass +class TestIndex(base.BaseIndexTests): + pass + + class TestMissing(base.BaseMissingTests): @pytest.mark.skip(reason="Not implemented") def test_fillna_limit_pad(self, data_missing): diff --git a/pandas/tests/extension/test_datetime.py b/pandas/tests/extension/test_datetime.py index de5a6b7a5bb06..0c4759050c7c4 100644 --- a/pandas/tests/extension/test_datetime.py +++ b/pandas/tests/extension/test_datetime.py @@ -107,6 +107,10 @@ class TestGetitem(BaseDatetimeTests, base.BaseGetitemTests): pass +class TestIndex(base.BaseIndexTests): + pass + + class TestMethods(BaseDatetimeTests, base.BaseMethodsTests): @pytest.mark.skip(reason="Incorrect expected") def test_value_counts(self, all_data, dropna): diff --git a/pandas/tests/extension/test_floating.py b/pandas/tests/extension/test_floating.py index 2b08c5b7be450..a6f1592d55224 100644 --- a/pandas/tests/extension/test_floating.py +++ b/pandas/tests/extension/test_floating.py @@ -167,6 +167,10 @@ class TestSetitem(base.BaseSetitemTests): pass +class TestIndex(base.BaseIndexTests): + pass + + class TestMissing(base.BaseMissingTests): pass diff --git a/pandas/tests/extension/test_integer.py b/pandas/tests/extension/test_integer.py index 7d343aab3c7a0..4e64d8332ac63 100644 --- a/pandas/tests/extension/test_integer.py +++ b/pandas/tests/extension/test_integer.py @@ -190,6 +190,10 @@ class TestSetitem(base.BaseSetitemTests): pass +class TestIndex(base.BaseIndexTests): + pass + + class TestMissing(base.BaseMissingTests): pass diff --git a/pandas/tests/extension/test_interval.py b/pandas/tests/extension/test_interval.py index 24c0d619e2b1a..11014f245cdce 100644 --- a/pandas/tests/extension/test_interval.py +++ b/pandas/tests/extension/test_interval.py @@ -90,6 +90,10 @@ class TestGetitem(BaseInterval, base.BaseGetitemTests): pass 
+class TestIndex(base.BaseIndexTests): + pass + + class TestGrouping(BaseInterval, base.BaseGroupbyTests): pass diff --git a/pandas/tests/extension/test_numpy.py b/pandas/tests/extension/test_numpy.py index e60f7769270bd..3c359eac6deaf 100644 --- a/pandas/tests/extension/test_numpy.py +++ b/pandas/tests/extension/test_numpy.py @@ -219,6 +219,12 @@ def test_getitem_scalar(self, data): super().test_getitem_scalar(data) +# TODO Index.__new__ checks for PandasArray (and converts it explicitly), so the +# monkeypatching to make PandasArray into a regular ExtensionArray doesn't work +# class TestIndex(base.BaseIndexTests): +# pass + + class TestGroupby(BaseNumPyTests, base.BaseGroupbyTests): def test_groupby_extension_apply( self, data_for_grouping, groupby_apply_op, request diff --git a/pandas/tests/extension/test_period.py b/pandas/tests/extension/test_period.py index f210a4ce56091..873fe9d6463ad 100644 --- a/pandas/tests/extension/test_period.py +++ b/pandas/tests/extension/test_period.py @@ -85,6 +85,10 @@ class TestGetitem(BasePeriodTests, base.BaseGetitemTests): pass +class TestIndex(base.BaseIndexTests): + pass + + class TestMethods(BasePeriodTests, base.BaseMethodsTests): def test_combine_add(self, data_repeated): # Period + Period is not defined. diff --git a/pandas/tests/extension/test_sparse.py b/pandas/tests/extension/test_sparse.py index 012a3fbb12cac..fb3f95f88da71 100644 --- a/pandas/tests/extension/test_sparse.py +++ b/pandas/tests/extension/test_sparse.py @@ -194,6 +194,19 @@ def test_reindex(self, data, na_value): # Skipping TestSetitem, since we don't implement it. +class TestIndex(base.BaseIndexTests): + def test_index_from_array(self, data): + idx = pd.Index(data) + # TODO do we want to preserve the sparse dtype in the index + # now this is possible? 
+ assert idx.dtype == data.dtype.subtype + + # TODO this is failing because it doesn't recognize the sparse dtype + @pytest.mark.xfail(reason="Index cannot yet store sparse dtype") + def test_index_from_listlike_with_dtype(self, data): + super().test_index_from_listlike_with_dtype(data) + + class TestMissing(BaseSparseTests, base.BaseMissingTests): def test_isna(self, data_missing): expected_dtype = SparseDtype(bool, pd.isna(data_missing.dtype.fill_value)) diff --git a/pandas/tests/extension/test_string.py b/pandas/tests/extension/test_string.py index af6c149447d15..827a410871329 100644 --- a/pandas/tests/extension/test_string.py +++ b/pandas/tests/extension/test_string.py @@ -136,6 +136,10 @@ def test_setitem_preserves_views(self, data, request): super().test_setitem_preserves_views(data) +class TestIndex(base.BaseIndexTests): + pass + + class TestMissing(base.BaseMissingTests): pass From 4abd60ea5a75a76b798e7fc186cf331e7139e6d4 Mon Sep 17 00:00:00 2001 From: Brock Date: Mon, 29 Nov 2021 20:15:50 -0800 Subject: [PATCH 38/57] update tests --- pandas/tests/base/test_conversion.py | 2 +- pandas/tests/indexes/test_numpy_compat.py | 6 +++++- pandas/tests/reductions/test_reductions.py | 2 +- pandas/tests/strings/test_strings.py | 3 --- 4 files changed, 7 insertions(+), 6 deletions(-) diff --git a/pandas/tests/base/test_conversion.py b/pandas/tests/base/test_conversion.py index c483d4354a7b9..a4319f1f2b4e2 100644 --- a/pandas/tests/base/test_conversion.py +++ b/pandas/tests/base/test_conversion.py @@ -267,7 +267,7 @@ def test_numpy_array_all_dtypes(any_numpy_dtype): ) def test_array(arr, attr, index_or_series, request): box = index_or_series - if arr.dtype.name in ("Int64", "Sparse[int64, 0]") and box is pd.Index: + if arr.dtype.name in ("Sparse[int64, 0]") and box is pd.Index: mark = pytest.mark.xfail(reason="Needs EA-Backed Index") request.node.add_marker(mark) diff --git a/pandas/tests/indexes/test_numpy_compat.py b/pandas/tests/indexes/test_numpy_compat.py index 
a619b6756f00a..8bca938218c0a 100644 --- a/pandas/tests/indexes/test_numpy_compat.py +++ b/pandas/tests/indexes/test_numpy_compat.py @@ -117,11 +117,15 @@ def test_numpy_ufuncs_other(index, func, request): @pytest.mark.parametrize("func", [np.maximum, np.minimum]) -def test_numpy_ufuncs_reductions(index, func): +def test_numpy_ufuncs_reductions(index, func, request): # TODO: overlap with tests.series.test_ufunc.test_reductions if len(index) == 0: return + if repr(index.dtype) == "string[pyarrow]" or index.dtype == "boolean": + mark = pytest.mark.xfail(reason="ArrowStringArray/BooleanArray has no min/max") + request.node.add_marker(mark) + if isinstance(index, CategoricalIndex) and index.dtype.ordered is False: with pytest.raises(TypeError, match="is not ordered for"): func.reduce(index) diff --git a/pandas/tests/reductions/test_reductions.py b/pandas/tests/reductions/test_reductions.py index cc7121d2f4656..2cc204102cb74 100644 --- a/pandas/tests/reductions/test_reductions.py +++ b/pandas/tests/reductions/test_reductions.py @@ -83,7 +83,7 @@ def test_nanminmax(self, opname, dtype, val, index_or_series, request): # GH#7261 klass = index_or_series - if dtype in ["Int64", "boolean"] and klass == Index: + if dtype in ["boolean"] and klass == Index: mark = pytest.mark.xfail(reason="Need EA-backed Index") request.node.add_marker(mark) diff --git a/pandas/tests/strings/test_strings.py b/pandas/tests/strings/test_strings.py index 3e49d6367ffd9..461443385a74b 100644 --- a/pandas/tests/strings/test_strings.py +++ b/pandas/tests/strings/test_strings.py @@ -367,9 +367,6 @@ def test_len_mixed(): def test_index( method, sub, start, end, index_or_series, any_string_dtype, expected, request ): - if index_or_series is Index and not any_string_dtype == "object": - mark = pytest.mark.xfail(reason="Need EA-backed Index") - request.node.add_marker(mark) obj = index_or_series( ["ABCDEFG", "BCDEFEF", "DEFGHIJEF", "EFGHEF"], dtype=any_string_dtype From 
7f9741efdace376c38be7eb9837c2fbdc439507d Mon Sep 17 00:00:00 2001 From: Brock Date: Tue, 30 Nov 2021 08:23:56 -0800 Subject: [PATCH 39/57] older np compat --- pandas/tests/indexes/test_numpy_compat.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/indexes/test_numpy_compat.py b/pandas/tests/indexes/test_numpy_compat.py index 8bca938218c0a..58711ae0d8afc 100644 --- a/pandas/tests/indexes/test_numpy_compat.py +++ b/pandas/tests/indexes/test_numpy_compat.py @@ -122,7 +122,7 @@ def test_numpy_ufuncs_reductions(index, func, request): if len(index) == 0: return - if repr(index.dtype) == "string[pyarrow]" or index.dtype == "boolean": + if repr(index.dtype) == "string[pyarrow]" or str(index.dtype) == "boolean": mark = pytest.mark.xfail(reason="ArrowStringArray/BooleanArray has no min/max") request.node.add_marker(mark) From 40e861b8d92aab7a33e8a014c757e6a8c70f1710 Mon Sep 17 00:00:00 2001 From: Brock Date: Tue, 30 Nov 2021 16:42:51 -0800 Subject: [PATCH 40/57] 32bit compat --- pandas/tests/extension/test_sparse.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/pandas/tests/extension/test_sparse.py b/pandas/tests/extension/test_sparse.py index ea085f638342a..564f2cb2915a7 100644 --- a/pandas/tests/extension/test_sparse.py +++ b/pandas/tests/extension/test_sparse.py @@ -199,9 +199,14 @@ def test_index_from_array(self, data): idx = pd.Index(data) # TODO do we want to preserve the sparse dtype in the index # now this is possible? 
+ if data.dtype.subtype == "f": + assert idx.dtype == np.float64 + elif data.dtype.subtype == "i": + assert idx.dtype == np.int64 assert idx.dtype == data.dtype.subtype - # TODO this is failing because it doesn't recognize the sparse dtype + # TODO(ExtensionIndex) this is failing because it doesn't recognize + # the sparse dtype @pytest.mark.xfail(reason="Index cannot yet store sparse dtype") def test_index_from_listlike_with_dtype(self, data): super().test_index_from_listlike_with_dtype(data) From 6e79350532732397d5ee3b1752dd6a859b29e7d7 Mon Sep 17 00:00:00 2001 From: Brock Date: Tue, 30 Nov 2021 18:44:34 -0800 Subject: [PATCH 41/57] simplify, docstring --- pandas/_libs/lib.pyx | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index b6372543d2fc7..4d2b910071130 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -3041,22 +3041,23 @@ def is_bool_list(obj: list) -> bool: cpdef ndarray eq_NA_compat(ndarray[object] arr, object key): + """ + Check for `arr == key`, treating all values as not-equal to pd.NA. 
+ + key is assumed to have `not isna(key)` + """ cdef: ndarray[uint8_t, cast=True] result = np.empty(len(arr), dtype=bool) Py_ssize_t i object item - if key is C_NA: - for i in range(len(arr)): - item = arr[i] - result[i] = item is C_NA - else: - for i in range(len(arr)): - item = arr[i] - if item is C_NA: - result[i] = False - else: - result[i] = item == key # FIXME: compat for other NAs + for i in range(len(arr)): + item = arr[i] + if item is C_NA: + result[i] = False + else: + result[i] = item == key + return result From 90366e9d3851d007c76b733a6f0da59080d484e0 Mon Sep 17 00:00:00 2001 From: Brock Date: Tue, 30 Nov 2021 20:40:47 -0800 Subject: [PATCH 42/57] 32bit compat --- pandas/tests/extension/test_sparse.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/tests/extension/test_sparse.py b/pandas/tests/extension/test_sparse.py index 564f2cb2915a7..59e098ee59c40 100644 --- a/pandas/tests/extension/test_sparse.py +++ b/pandas/tests/extension/test_sparse.py @@ -203,7 +203,8 @@ def test_index_from_array(self, data): assert idx.dtype == np.float64 elif data.dtype.subtype == "i": assert idx.dtype == np.int64 - assert idx.dtype == data.dtype.subtype + else: + assert idx.dtype == data.dtype.subtype # TODO(ExtensionIndex) this is failing because it doesn't recognize # the sparse dtype From 70debb248a61e80d1d9540330065e0ef2e85fab4 Mon Sep 17 00:00:00 2001 From: Brock Date: Wed, 1 Dec 2021 11:09:08 -0800 Subject: [PATCH 43/57] Address comments --- pandas/core/dtypes/common.py | 3 ++- pandas/core/indexes/base.py | 1 + pandas/tests/base/test_conversion.py | 2 +- pandas/tests/extension/test_numpy.py | 8 ++------ 4 files changed, 6 insertions(+), 8 deletions(-) diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py index 7ac8e6c47158c..e1770dc713c9a 100644 --- a/pandas/core/dtypes/common.py +++ b/pandas/core/dtypes/common.py @@ -1319,7 +1319,8 @@ def is_bool_dtype(arr_or_dtype) -> bool: # we don't have a boolean Index class # so 
its object, we need to infer to # guess this - return arr_or_dtype.is_object() and arr_or_dtype.inferred_type == "boolean" + # Allow Index[object] that is all-bools or Index["boolean"] + return arr_or_dtype.inferred_type == "boolean" elif is_extension_array_dtype(arr_or_dtype): return getattr(dtype, "_is_boolean", False) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index cc6a6430d2fb5..0166a649892ad 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -3471,6 +3471,7 @@ def symmetric_difference(self, other, result_name=None, sort=None): res_values = concat_compat([left_diff, right_diff]) res_values = _maybe_try_sort(res_values, sort) + # pass dtype so we retain object dtype result = Index(res_values, name=result_name, dtype=res_values.dtype) if self._is_multi: diff --git a/pandas/tests/base/test_conversion.py b/pandas/tests/base/test_conversion.py index 97037b95e05b6..49afe2859098d 100644 --- a/pandas/tests/base/test_conversion.py +++ b/pandas/tests/base/test_conversion.py @@ -257,7 +257,7 @@ def test_numpy_array_all_dtypes(any_numpy_dtype): def test_array(arr, attr, index_or_series, request): box = index_or_series if arr.dtype.name in ("Sparse[int64, 0]") and box is pd.Index: - mark = pytest.mark.xfail(reason="Needs EA-Backed Index") + mark = pytest.mark.xfail(reason="Index cannot yet store sparse dtype") request.node.add_marker(mark) result = box(arr, copy=False).array diff --git a/pandas/tests/extension/test_numpy.py b/pandas/tests/extension/test_numpy.py index 3c359eac6deaf..34003fd092c18 100644 --- a/pandas/tests/extension/test_numpy.py +++ b/pandas/tests/extension/test_numpy.py @@ -12,6 +12,8 @@ classes (if they are relevant for the extension interface for all dtypes), or be added to the array-specific tests in `pandas/tests/arrays/`. +Note: we do not bother with base.BaseIndexTests because PandasArray +will never be held in an Index. 
""" import numpy as np import pytest @@ -219,12 +221,6 @@ def test_getitem_scalar(self, data): super().test_getitem_scalar(data) -# TODO Index.__new__ checks for PandasArray (and converts it explicitly), so the -# monkeypatching to make PandasArray into a regular ExtensionArray doesn't work -# class TestIndex(base.BaseIndexTests): -# pass - - class TestGroupby(BaseNumPyTests, base.BaseGroupbyTests): def test_groupby_extension_apply( self, data_for_grouping, groupby_apply_op, request From 0339e69f27b0e09b6bc142a41f546196c5be39b7 Mon Sep 17 00:00:00 2001 From: Brock Date: Sat, 4 Dec 2021 18:34:02 -0800 Subject: [PATCH 44/57] simplify --- pandas/tests/indexes/test_base.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index b383381d9a5c5..7d62b2f6f2247 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -530,21 +530,20 @@ def test_map_dictlike(self, index, mapper): # Cannot map duplicated index return + rng = np.arange(len(index), 0, -1) + if index.empty: # to match proper result coercion for uints expected = Index([]) elif index._is_backward_compat_public_numeric_index: - expected = index._constructor( - np.arange(len(index), 0, -1), dtype=index.dtype - ) + expected = index._constructor(rng, dtype=index.dtype) elif type(index) is Index and index.dtype != object: # i.e. EA-backed, for now just Nullable - expected = Index(np.arange(len(index), 0, -1), dtype=index.dtype) + expected = Index(rng, dtype=index.dtype) elif index.dtype.kind == "u": - # TODO: case where e.g. we cannot hold result in UInt8? 
- expected = Index(np.arange(len(index), 0, -1), dtype=index.dtype) + expected = Index(rng, dtype=index.dtype) else: - expected = Index(np.arange(len(index), 0, -1)) + expected = Index(rng) result = index.map(mapper(expected, index)) tm.assert_index_equal(result, expected) From 96b25aab8cfab27891adc3af735b323a75ec5af7 Mon Sep 17 00:00:00 2001 From: Brock Date: Sun, 5 Dec 2021 14:31:06 -0800 Subject: [PATCH 45/57] dont catch np.float16 too early --- pandas/core/arrays/floating.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/pandas/core/arrays/floating.py b/pandas/core/arrays/floating.py index f127b1db4c553..ef812b0823ca9 100644 --- a/pandas/core/arrays/floating.py +++ b/pandas/core/arrays/floating.py @@ -251,12 +251,7 @@ def dtype(self) -> FloatingDtype: return FLOAT_STR_TO_DTYPE[str(self._data.dtype)] def __init__(self, values: np.ndarray, mask: np.ndarray, copy: bool = False): - if not ( - isinstance(values, np.ndarray) - and values.dtype.kind == "f" - and values.dtype.itemsize > 2 - ): - # We do not support float16 + if not (isinstance(values, np.ndarray) and values.dtype.kind == "f"): raise TypeError( "values should be floating numpy array. 
Use " "the 'pd.array' function instead" From 812da93a0807e522a35091c4be1248b7eb8a3353 Mon Sep 17 00:00:00 2001 From: Brock Date: Tue, 7 Dec 2021 11:39:26 -0800 Subject: [PATCH 46/57] de-xfail --- pandas/tests/indexes/test_numpy_compat.py | 4 ++-- pandas/tests/reductions/test_reductions.py | 6 +----- 2 files changed, 3 insertions(+), 7 deletions(-) diff --git a/pandas/tests/indexes/test_numpy_compat.py b/pandas/tests/indexes/test_numpy_compat.py index 58711ae0d8afc..03bdc13d1ea8b 100644 --- a/pandas/tests/indexes/test_numpy_compat.py +++ b/pandas/tests/indexes/test_numpy_compat.py @@ -122,8 +122,8 @@ def test_numpy_ufuncs_reductions(index, func, request): if len(index) == 0: return - if repr(index.dtype) == "string[pyarrow]" or str(index.dtype) == "boolean": - mark = pytest.mark.xfail(reason="ArrowStringArray/BooleanArray has no min/max") + if repr(index.dtype) == "string[pyarrow]": + mark = pytest.mark.xfail(reason="ArrowStringArray has no min/max") request.node.add_marker(mark) if isinstance(index, CategoricalIndex) and index.dtype.ordered is False: diff --git a/pandas/tests/reductions/test_reductions.py b/pandas/tests/reductions/test_reductions.py index 004c087044017..70e739d1440d6 100644 --- a/pandas/tests/reductions/test_reductions.py +++ b/pandas/tests/reductions/test_reductions.py @@ -79,14 +79,10 @@ def test_ops(self, opname, obj): ("boolean", True), ], ) - def test_nanminmax(self, opname, dtype, val, index_or_series, request): + def test_nanminmax(self, opname, dtype, val, index_or_series): # GH#7261 klass = index_or_series - if dtype in ["boolean"] and klass == Index: - mark = pytest.mark.xfail(reason="Need EA-backed Index") - request.node.add_marker(mark) - def check_missing(res): if dtype == "datetime64[ns]": return res is NaT From e3943942f6e5aec539df01341d42188860cb7044 Mon Sep 17 00:00:00 2001 From: Brock Date: Tue, 7 Dec 2021 14:54:52 -0800 Subject: [PATCH 47/57] remove edits made extraneous by other PRs --- pandas/core/arrays/floating.py | 2 -- 
pandas/core/arrays/masked.py | 4 ---- 2 files changed, 6 deletions(-) diff --git a/pandas/core/arrays/floating.py b/pandas/core/arrays/floating.py index 8611f8c8af0e3..4c868747fa930 100644 --- a/pandas/core/arrays/floating.py +++ b/pandas/core/arrays/floating.py @@ -103,8 +103,6 @@ def coerce_to_array( if dtype is None and hasattr(values, "dtype"): if is_float_dtype(values.dtype): dtype = values.dtype - if dtype == "float16": - raise TypeError("FloatingArray does not support float16 dtype") if dtype is not None: if isinstance(dtype, str) and dtype.startswith("Float"): diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index 8e02e21f59f42..1c9c2a4780722 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -468,10 +468,6 @@ def reconstruct(x): m = mask.copy() return IntegerArray(x, m) elif is_float_dtype(x.dtype): - if x.dtype.itemsize <= 2: - # reached in e.g. np.sqrt on BooleanArray - # we don't support float16 - x = x.astype(np.float32) m = mask.copy() if x.dtype == np.float16: # reached in e.g. 
np.sqrt on BooleanArray From 3e1ec002e99af65aff18e41c88026cb506d6b768 Mon Sep 17 00:00:00 2001 From: Brock Date: Mon, 13 Dec 2021 08:48:23 -0800 Subject: [PATCH 48/57] suggested edits --- pandas/core/dtypes/common.py | 5 ----- pandas/tests/base/test_value_counts.py | 7 ++----- pandas/tests/indexes/common.py | 7 +------ 3 files changed, 3 insertions(+), 16 deletions(-) diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py index 01737591c2faa..72df30a4c96aa 100644 --- a/pandas/core/dtypes/common.py +++ b/pandas/core/dtypes/common.py @@ -1323,11 +1323,6 @@ def is_bool_dtype(arr_or_dtype) -> bool: # now we use the special definition for Index if isinstance(arr_or_dtype, ABCIndex): - - # TODO(jreback) - # we don't have a boolean Index class - # so its object, we need to infer to - # guess this # Allow Index[object] that is all-bools or Index["boolean"] return arr_or_dtype.inferred_type == "boolean" elif is_extension_array_dtype(arr_or_dtype): diff --git a/pandas/tests/base/test_value_counts.py b/pandas/tests/base/test_value_counts.py index f9ba713d20dfe..6a479ee347f16 100644 --- a/pandas/tests/base/test_value_counts.py +++ b/pandas/tests/base/test_value_counts.py @@ -32,16 +32,14 @@ def test_value_counts(index_or_series_obj): expected.index = Index(expected.index) if not isinstance(result.dtype, np.dtype): - # TODO: be more specific # i.e IntegerDtype - expected = expected.astype(result.dtype) + expected = expected.astype("Int64") # TODO(GH#32514): Order of entries with the same count is inconsistent # on CI (gh-32449) if obj.duplicated().any(): result = result.sort_index() expected = expected.sort_index() - tm.assert_series_equal(result, expected) @@ -78,9 +76,8 @@ def test_value_counts_null(null_obj, index_or_series_obj): result = result.sort_index() if not isinstance(result.dtype, np.dtype): - # TODO: be more specific # i.e IntegerDtype - expected = expected.astype(result.dtype) + expected = expected.astype("Int64") tm.assert_series_equal(result, 
expected) expected[null_obj] = 3 diff --git a/pandas/tests/indexes/common.py b/pandas/tests/indexes/common.py index 7a031e4319654..699ef88bf419f 100644 --- a/pandas/tests/indexes/common.py +++ b/pandas/tests/indexes/common.py @@ -252,12 +252,7 @@ def test_ensure_copied_data(self, index): index._values._ndarray, result._values._ndarray, check_same="same" ) elif index.dtype == "string[pyarrow]": - # https://github.com/pandas-dev/pandas/pull/43930#discussion_r736862669 - result_pa_data = result._values._data - index_pa_data = index._values._data - res_buf1 = result_pa_data.chunk(0).buffers()[1] - idx_buf1 = index_pa_data.chunk(0).buffers()[1] - assert res_buf1.address == idx_buf1.address + assert tm.shares_memory(result._values, index._values) else: raise NotImplementedError(index.dtype) else: From 267b1b33634f81e882372c6476a159acf55c3586 Mon Sep 17 00:00:00 2001 From: Brock Date: Wed, 15 Dec 2021 15:03:05 -0800 Subject: [PATCH 49/57] Remove NullableEngine, ExtensionEngine --- pandas/_libs/index.pyi | 36 ---- pandas/_libs/index.pyx | 350 ------------------------------------ pandas/core/indexes/base.py | 20 ++- 3 files changed, 14 insertions(+), 392 deletions(-) diff --git a/pandas/_libs/index.pyi b/pandas/_libs/index.pyi index 21f1c3e147a59..68163c1284b30 100644 --- a/pandas/_libs/index.pyi +++ b/pandas/_libs/index.pyi @@ -71,39 +71,3 @@ class BaseMultiIndexCodesEngine: method: str, limit: int | None, ) -> npt.NDArray[np.intp]: ... - -class NullableEngine: - def __init__(self, values: "BaseMaskedArray"): ... - def __contains__(self, val: object) -> bool: ... - def get_loc(self, val: object) -> int | slice | np.ndarray: ... - def get_indexer(self, values: np.ndarray) -> npt.NDArray[np.intp]: ... - def get_indexer_non_unique( - self, - targets: np.ndarray, - ) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]: ... - @property - def is_unique(self) -> bool: ... - @property - def is_monotonic_increasing(self) -> bool: ... 
- @property - def is_monotonic_decreasing(self) -> bool: ... - def sizeof(self, deep: bool = ...) -> int: ... - def clear_mapping(self): ... - -class ExtensionEngine: - def __init__(self, values: "ExtensionArray"): ... - def __contains__(self, val: object) -> bool: ... - def get_loc(self, val: object) -> int | slice | np.ndarray: ... - def get_indexer(self, values: np.ndarray) -> npt.NDArray[np.intp]: ... - def get_indexer_non_unique( - self, - targets: np.ndarray, - ) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]: ... - @property - def is_unique(self) -> bool: ... - @property - def is_monotonic_increasing(self) -> bool: ... - @property - def is_monotonic_decreasing(self) -> bool: ... - def sizeof(self, deep: bool = ...) -> int: ... - def clear_mapping(self): ... diff --git a/pandas/_libs/index.pyx b/pandas/_libs/index.pyx index 37ac02a2bd507..c3b86165e6d2c 100644 --- a/pandas/_libs/index.pyx +++ b/pandas/_libs/index.pyx @@ -797,353 +797,3 @@ cdef class BaseMultiIndexCodesEngine: # Generated from template. 
include "index_class_helper.pxi" - - -@cython.internal -@cython.freelist(32) -cdef class SharedEngine: - cdef readonly: - object values # ExtensionArray - bint over_size_threshold - - cdef: - bint unique, monotonic_inc, monotonic_dec - bint need_monotonic_check, need_unique_check - - def __contains__(self, val: object) -> bool: - # We assume before we get here: - # - val is hashable - try: - self.get_loc(val) - return True - except KeyError: - return False - - def clear_mapping(self): - # for compat with IndexEngine - pass - - @property - def is_unique(self) -> bool: - if self.need_unique_check: - arr = self.values.unique() - self.unique = len(arr) == len(self.values) - - self.need_unique_check = False - return self.unique - - cdef _do_monotonic_check(self): - raise NotImplementedError - - @property - def is_monotonic_increasing(self) -> bool: - if self.need_monotonic_check: - self._do_monotonic_check() - - return self.monotonic_inc == 1 - - @property - def is_monotonic_decreasing(self) -> bool: - if self.need_monotonic_check: - self._do_monotonic_check() - - return self.monotonic_dec == 1 - - cdef _call_monotonic(self, values): - return algos.is_monotonic(values, timelike=False) - - def sizeof(self, deep: bool = False) -> int: - """ return the sizeof our mapping """ - return 0 - - def __sizeof__(self) -> int: - return self.sizeof() - - cdef _check_type(self, object obj): - raise NotImplementedError - - cpdef get_loc(self, object val): - # -> Py_ssize_t | slice | ndarray[bool] - cdef: - Py_ssize_t loc - - if is_definitely_invalid_key(val): - raise TypeError(f"'{val}' is an invalid key") - - self._check_type(val) - - if self.over_size_threshold and self.is_monotonic_increasing: - if not self.is_unique: - return self._get_loc_duplicates(val) - - values = self.values - - loc = self._searchsorted_left(val) - if loc >= len(values): - raise KeyError(val) - if values[loc] != val: - raise KeyError(val) - return loc - - if not self.unique: - return 
self._get_loc_duplicates(val) - - return self._get_loc_duplicates(val) - - cdef inline _get_loc_duplicates(self, object val): - # -> Py_ssize_t | slice | ndarray[bool] - cdef: - Py_ssize_t diff - - if self.is_monotonic_increasing: - values = self.values - try: - left = values.searchsorted(val, side='left') - right = values.searchsorted(val, side='right') - except TypeError: - # e.g. GH#29189 get_loc(None) with a Float64Index - raise KeyError(val) - - diff = right - left - if diff == 0: - raise KeyError(val) - elif diff == 1: - return left - else: - return slice(left, right) - - return self._maybe_get_bool_indexer(val) - - cdef Py_ssize_t _searchsorted_left(self, val) except? -1: - """ - See ObjectEngine._searchsorted_left.__doc__. - """ - try: - loc = self.values.searchsorted(val, side="left") - except TypeError as err: - # GH#35788 e.g. val=None with float64 values - raise KeyError(val) - return loc - - cdef ndarray _get_bool_indexer(self, val): - raise NotImplementedError - - cdef _maybe_get_bool_indexer(self, object val): - # Returns ndarray[bool] or int - cdef: - ndarray[uint8_t, ndim=1, cast=True] indexer - - indexer = self._get_bool_indexer(val) - return _unpack_bool_indexer(indexer, val) - - def get_indexer(self, values) -> np.ndarray: - # values : type(self.values) - # Note: we only get here with self.is_unique - cdef: - Py_ssize_t i, N = len(values) - - res = np.empty(N, dtype=np.intp) - - for i in range(N): - val = values[i] - try: - loc = self.get_loc(val) - # Because we are unique, loc should always be an integer - except KeyError: - loc = -1 - else: - assert util.is_integer_object(loc), (loc, val) - res[i] = loc - - return res - - def get_indexer_non_unique(self, targets): - """ - Return an indexer suitable for taking from a non unique index - return the labels in the same order as the target - and a missing indexer into the targets (which correspond - to the -1 indices in the results - - Parameters - ---------- - targets : type(self.values) - - 
Returns - ------- - indexer : np.ndarray[np.intp] - missing : np.ndarray[np.intp] - """ - cdef: - Py_ssize_t i, N = len(targets) - - indexer = [] - missing = [] - - # See also IntervalIndex.get_indexer_pointwise - for i in range(N): - val = targets[i] - - try: - locs = self.get_loc(val) - except KeyError: - locs = np.array([-1], dtype=np.intp) - missing.append(i) - else: - if isinstance(locs, slice): - # Only needed for get_indexer_non_unique - locs = np.arange(locs.start, locs.stop, locs.step, dtype=np.intp) - elif util.is_integer_object(locs): - locs = np.array([locs], dtype=np.intp) - else: - assert locs.dtype.kind == "b" - locs = locs.nonzero()[0] - - indexer.append(locs) - - try: - indexer = np.concatenate(indexer, dtype=np.intp) - except TypeError: - # numpy<1.20 doesn't accept dtype keyword - indexer = np.concatenate(indexer).astype(np.intp, copy=False) - missing = np.array(missing, dtype=np.intp) - - return indexer, missing - - -cdef class ExtensionEngine(SharedEngine): - def __init__(self, values: "ExtensionArray"): - self.values = values - - self.over_size_threshold = len(values) >= _SIZE_CUTOFF - self.need_unique_check = True - self.need_monotonic_check = True - self.need_unique_check = True - - cdef _do_monotonic_check(self): - cdef: - bint is_unique - - # FIXME: shouldn't depend on non-required _values_for_argsort - try: - self.monotonic_inc, self.monotonic_dec, is_unique = \ - self._call_monotonic(self.values._values_for_argsort()) - except TypeError: - self.monotonic_inc = 0 - self.monotonic_dec = 0 - is_unique = 0 - - self.need_monotonic_check = 0 - - # we can only be sure of uniqueness if is_unique=1 - if is_unique: - self.unique = 1 - self.need_unique_check = 0 - - cdef ndarray _get_bool_indexer(self, val): - if checknull(val): - return self.values.isna().view("uint8") - - try: - return self.values == val - except TypeError: - # e.g. 
if __eq__ returns a BooleanArray instead of ndarry[bool] - try: - return (self.values == val).to_numpy(dtype=bool, na_value=False) - except (TypeError, AttributeError) as err: - # e.g. (self.values == val) returned a bool - # see test_get_loc_generator[string[pyarrow]] - # e.g. self.value == val raises TypeError bc generator has no len - # see test_get_loc_generator[string[python]] - raise KeyError from err - - cdef _check_type(self, object val): - hash(val) - - -cdef class NullableEngine(SharedEngine): - - cdef readonly: - ndarray _values, _mask - bint has_missing - - def __init__(self, values: "MaskedArray"): - self.values = values - - self._values = values._data - self._mask = values._mask - - self.has_missing = values._mask.any() - - self.over_size_threshold = len(values) >= _SIZE_CUTOFF - self.need_unique_check = True - - cdef _do_monotonic_check(self): - cdef: - bint is_unique - - if self.has_missing: - self.monotonic_inc = 0 - self.monotonic_dec = 0 - self.need_monotonic_check = 0 - return - - # If there are no missing, then we can just look at self._values - try: - self.monotonic_inc, self.monotonic_dec, is_unique = \ - self._call_monotonic(self._values) - except TypeError: - self.monotonic_inc = 0 - self.monotonic_dec = 0 - is_unique = 0 - - self.need_monotonic_check = 0 - - # we can only be sure of uniqueness if is_unique=1 - if is_unique: - self.unique = 1 - self.need_unique_check = 0 - - cpdef get_loc(self, object val): - # -> Py_ssize_t | slice | ndarray[bool] - cdef: - Py_ssize_t loc - - if val is NA: - # TODO: return copy? readonly view? - # TODO: do this later on to keep same pattern as IndexEngine? - if not self.has_missing: - raise KeyError(val) - return _unpack_bool_indexer(self._mask, val) - - return SharedEngine.get_loc(self, val) - - cdef ndarray _get_bool_indexer(self, val): - if val is NA: - # TODO: KeyError(val) if not has_missing? - # TODO: readonly? copy? 
- return self._mask.view("uint8") - - if util.is_nan(val): - res = np.isnan(self._values) - else: - res = self._values == val - - res[self._mask] = False - return res - - cdef _check_type(self, object val): - kind = self._values.dtype.kind - if kind in ["i", "u"]: - if not util.is_integer_object(val): - raise KeyError(val) - if kind == "u": - if val < 0: - # cannot have negative values with unsigned int dtype - raise KeyError(val) - elif kind == "b": - if not util.is_bool_object(val): - raise KeyError(val) - else: - if not util.is_integer_object(val) and not util.is_float_object(val): - # in particular catch bool and avoid casting True -> 1.0 - raise KeyError(val) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 0fa17e4f69d9a..f66489d467032 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -834,16 +834,18 @@ def _cleanup(self) -> None: @cache_readonly def _engine( self, - ) -> libindex.IndexEngine | libindex.NullableEngine | libindex.ExtensionEngine: + ) -> libindex.IndexEngine: # For base class (object dtype) we get ObjectEngine if isinstance(self._values, BaseMaskedArray): - return libindex.NullableEngine(self._values) + # TODO(ExtensionIndex): use libindex.NullableEngine(self._values) + return libindex.ObjectEngine(self._get_engine_target()) elif ( isinstance(self._values, ExtensionArray) and self._engine_type is libindex.ObjectEngine ): - return libindex.ExtensionEngine(self._values) + # TODO(ExtensionIndex): use libindex.ExtensionEngine(self._values) + return libindex.ObjectEngine(self._get_engine_target()) # to avoid a reference cycle, bind `target_values` to a local variable, so # `self` is not passed into the lambda. 
@@ -3209,10 +3211,13 @@ def _wrap_setop_result(self, other: Index, result) -> Index: name = get_op_result_name(self, other) if isinstance(result, Index): if result.name != name: - return result.rename(name) - return result + result = result.rename(name) else: - return self._shallow_copy(result, name=name) + result = self._shallow_copy(result, name=name) + + # TODO(ExtensionIndex): revert this astype; it is a kludge to make + # it possible to split ExtensionEngine from ExtensionIndex PR. + return result.astype(self.dtype, copy=False) # TODO: standardize return type of non-union setops type(self vs other) @final @@ -4782,6 +4787,9 @@ def _get_engine_target(self) -> np.ndarray: """ # error: Incompatible return value type (got "Union[ExtensionArray, # ndarray]", expected "ndarray") + if type(self) is Index and isinstance(self._values, ExtensionArray): + # TODO(ExtensionIndex): remove special-case, just use self._values + return self._values.astype(object) return self._values # type: ignore[return-value] def _from_join_target(self, result: np.ndarray) -> ArrayLike: From c8072c5281a09728334ebd2648d4c673067b34fe Mon Sep 17 00:00:00 2001 From: Brock Date: Tue, 21 Dec 2021 15:39:23 -0800 Subject: [PATCH 50/57] revert --- pandas/_libs/index.pyi | 8 -------- 1 file changed, 8 deletions(-) diff --git a/pandas/_libs/index.pyi b/pandas/_libs/index.pyi index 68163c1284b30..446a980487cde 100644 --- a/pandas/_libs/index.pyi +++ b/pandas/_libs/index.pyi @@ -1,17 +1,9 @@ -from typing import TYPE_CHECKING - import numpy as np from pandas._typing import npt from pandas import MultiIndex -if TYPE_CHECKING: - from pandas.core.arrays import ( - BaseMaskedArray, - ExtensionArray, - ) - class IndexEngine: over_size_threshold: bool def __init__(self, values: np.ndarray): ... 
From 7e0ac18849ce1cf74c5902773f0a76527952a024 Mon Sep 17 00:00:00 2001 From: Brock Date: Tue, 21 Dec 2021 15:44:36 -0800 Subject: [PATCH 51/57] remove no-longer-necessary --- pandas/core/indexes/base.py | 3 +-- pandas/tests/base/test_misc.py | 8 +------- 2 files changed, 2 insertions(+), 9 deletions(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 15cafce537e17..aebe2f1b36d26 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -6050,8 +6050,7 @@ def map(self, mapper, na_action=None): new_values, dtype=dtype, copy=False, name=self.name ) - result = Index._with_infer(new_values, dtype=dtype, copy=False, name=self.name) - return result + return Index._with_infer(new_values, dtype=dtype, copy=False, name=self.name) # TODO: De-duplicate with map, xref GH#32349 @final diff --git a/pandas/tests/base/test_misc.py b/pandas/tests/base/test_misc.py index 26d6ed172e78d..0c448ae3ee57c 100644 --- a/pandas/tests/base/test_misc.py +++ b/pandas/tests/base/test_misc.py @@ -156,13 +156,7 @@ def test_access_by_position(index_flat): assert index[-1] == index[size - 1] msg = f"index {size} is out of bounds for axis 0 with size {size}" - try: - eq = is_dtype_equal(index.dtype, "string[pyarrow]") - except ImportError: - # TODO(GH#44276) is_dtype_equal can raise here - eq = False - - if eq: + if is_dtype_equal(index.dtype, "string[pyarrow]"): msg = "index out of bounds" with pytest.raises(IndexError, match=msg): index[size] From 80453b45cddecd3577d28b188386b66b07650355 Mon Sep 17 00:00:00 2001 From: Brock Date: Tue, 21 Dec 2021 15:52:53 -0800 Subject: [PATCH 52/57] whatsnew --- doc/source/whatsnew/v1.4.0.rst | 37 ++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index 2592be9c4a350..df847fdcb08b0 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -90,6 +90,43 @@ be removed in the future, see :ref:`here ` for 
more about :class:`NumericIndex`. + +.. _whatsnew_140.enhancements.ExtensionIndex: + +Index can hold arbitrary ExtensionArrays +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Until now, passing a custom :class:`ExtensionArray` to ``pd.Index`` would cast the +array to ``object`` dtype. Now :class:`Index` can directly hold arbitrary ExtensionArrays (:issue:`43930`). + +*Previous behavior*: + +.. ipython:: python + + arr = pd.array([1, 2, pd.NA]) + idx = pd.Index(arr) + +In the old behavior, ``idx`` would be object-dtype: + +*Previous behavior*: + +.. code-block:: ipython + + In [1]: idx + Out[1]: Index([1, 2, <NA>], dtype='object') + +With the new behavior, we keep the original dtype: + +*New behavior*: + +.. ipython:: python + + idx + +One exception to this is ``SparseArray``, which will continue to cast to numpy +dtype until pandas 2.0. At that point it will retain its dtype like other +ExtensionArrays. + .. _whatsnew_140.enhancements.styler: Styler From 7231a9e71d6b29f47f913ea0ce5c4de9f452dec7 Mon Sep 17 00:00:00 2001 From: Brock Date: Tue, 21 Dec 2021 16:02:43 -0800 Subject: [PATCH 53/57] deprecation for SparseArray --- pandas/core/indexes/base.py | 8 ++++++++ pandas/tests/extension/test_sparse.py | 15 ++++++++++++--- .../tests/indexes/datetimes/test_constructors.py | 4 +++- 3 files changed, 23 insertions(+), 4 deletions(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index aebe2f1b36d26..825f80590c0ce 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -570,6 +570,14 @@ def _dtype_to_subclass(cls, dtype: DtypeObj): return PeriodIndex elif isinstance(dtype, SparseDtype): + warnings.warn( + "In a future version, passing a SparseArray to pd.Index " + "will store that array directly instead of converting to a " + "dense numpy ndarray. 
To retain the old behavior, use " + "pd.Index(arr.to_numpy()) instead", + FutureWarning, + stacklevel=find_stack_level(), + ) return cls._dtype_to_subclass(dtype.subtype) return Index diff --git a/pandas/tests/extension/test_sparse.py b/pandas/tests/extension/test_sparse.py index d22c70e86ff74..ea715f4fac6db 100644 --- a/pandas/tests/extension/test_sparse.py +++ b/pandas/tests/extension/test_sparse.py @@ -193,9 +193,10 @@ def test_reindex(self, data, na_value): class TestIndex(base.BaseIndexTests): def test_index_from_array(self, data): - idx = pd.Index(data) - # TODO do we want to preserve the sparse dtype in the index - # now this is possible? + msg = "will store that array directly" + with tm.assert_produces_warning(FutureWarning, match=msg): + idx = pd.Index(data) + if data.dtype.subtype == "f": assert idx.dtype == np.float64 elif data.dtype.subtype == "i": @@ -274,6 +275,14 @@ def test_fillna_frame(self, data_missing): class TestMethods(BaseSparseTests, base.BaseMethodsTests): + @pytest.mark.parametrize("ascending", [True, False]) + def test_sort_values_frame(self, data_for_sorting, ascending): + msg = "will store that array directly" + with tm.assert_produces_warning( + FutureWarning, match=msg, check_stacklevel=False + ): + super().test_sort_values_frame(data_for_sorting, ascending) + def test_combine_le(self, data_repeated): # We return a Series[SparseArray].__le__ returns a # Series[Sparse[bool]] diff --git a/pandas/tests/indexes/datetimes/test_constructors.py b/pandas/tests/indexes/datetimes/test_constructors.py index b27c5852cb97b..b1e764ceb7009 100644 --- a/pandas/tests/indexes/datetimes/test_constructors.py +++ b/pandas/tests/indexes/datetimes/test_constructors.py @@ -123,7 +123,9 @@ def test_constructor_from_sparse_array(self): Timestamp("2016-05-01T01:00:00.000000"), ] arr = pd.arrays.SparseArray(values) - result = Index(arr) + msg = "will store that array directly" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = Index(arr) 
expected = DatetimeIndex(values) tm.assert_index_equal(result, expected) From f78aa0f8f69f96e6da769ac1a2e3962462208d05 Mon Sep 17 00:00:00 2001 From: Brock Date: Tue, 21 Dec 2021 16:28:28 -0800 Subject: [PATCH 54/57] share _na_value method --- pandas/core/indexes/base.py | 8 ++++++-- pandas/core/indexes/datetimelike.py | 4 ---- pandas/tests/indexes/datetimelike_/test_nat.py | 1 - 3 files changed, 6 insertions(+), 7 deletions(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 825f80590c0ce..5698456c22f01 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -23,6 +23,7 @@ from pandas._config import get_option from pandas._libs import ( + NaT, algos as libalgos, index as libindex, lib, @@ -2616,9 +2617,12 @@ def __reduce__(self): @cache_readonly def _na_value(self): """The expected NA value to use with this index.""" - if isinstance(self.dtype, np.dtype): + dtype = self.dtype + if isinstance(dtype, np.dtype): + if dtype.kind in ["m", "M"]: + return NaT return np.nan - return self.dtype.na_value + return dtype.na_value @cache_readonly def _isnan(self) -> npt.NDArray[np.bool_]: diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py index 731efdc3b17f0..589b92f392ca8 100644 --- a/pandas/core/indexes/datetimelike.py +++ b/pandas/core/indexes/datetimelike.py @@ -24,7 +24,6 @@ ) from pandas._libs.tslibs import ( BaseOffset, - NaTType, Resolution, Tick, parsing, @@ -154,9 +153,6 @@ def __contains__(self, key: Any) -> bool: _can_hold_na = True - _na_value: NaTType = NaT - """The expected NA value to use with this index.""" - def _convert_tolerance(self, tolerance, target): tolerance = np.asarray(to_timedelta(tolerance).to_numpy()) return super()._convert_tolerance(tolerance, target) diff --git a/pandas/tests/indexes/datetimelike_/test_nat.py b/pandas/tests/indexes/datetimelike_/test_nat.py index b4a72ec65bd91..50cf29d016355 100644 --- a/pandas/tests/indexes/datetimelike_/test_nat.py +++ 
b/pandas/tests/indexes/datetimelike_/test_nat.py @@ -17,7 +17,6 @@ def test_nat(self, index_without_na): index_with_na = index_without_na.copy(deep=True) index_with_na._data[1] = NaT - assert type(index_without_na)._na_value is NaT assert empty_index._na_value is NaT assert index_with_na._na_value is NaT assert index_without_na._na_value is NaT From 453d6aec93bdc35494472aa802804c108826b606 Mon Sep 17 00:00:00 2001 From: Brock Date: Wed, 22 Dec 2021 09:51:02 -0800 Subject: [PATCH 55/57] mypy fixup, npdev catch warnings --- pandas/core/indexes/base.py | 8 +++++--- pandas/tests/base/test_conversion.py | 12 ++++++++++-- pandas/tests/series/test_ufunc.py | 11 +++++++++-- 3 files changed, 24 insertions(+), 7 deletions(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 5698456c22f01..bbcc40fa78d49 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -3768,7 +3768,7 @@ def _get_indexer( if target._is_multi and self._is_multi: engine = self._engine # error: "IndexEngine" has no attribute "_extract_level_codes" - tgt_values = engine._extract_level_codes( # type: ignore[union-attr] + tgt_values = engine._extract_level_codes( # type: ignore[attr-defined] target ) @@ -3848,7 +3848,7 @@ def _get_fill_indexer( # TODO: get_indexer_with_fill docstring says values must be _sorted_ # but that doesn't appear to be enforced # error: "IndexEngine" has no attribute "get_indexer_with_fill" - return self._engine.get_indexer_with_fill( # type: ignore[union-attr] + return self._engine.get_indexer_with_fill( # type: ignore[attr-defined] target=target._values, values=self._values, method=method, limit=limit ) @@ -5692,7 +5692,9 @@ def get_indexer_non_unique( if self._is_multi and target._is_multi: engine = self._engine # error: "IndexEngine" has no attribute "_extract_level_codes" - tgt_values = engine._extract_level_codes(target) # type: ignore[union-attr] + tgt_values = engine._extract_level_codes( + target # type: ignore[attr-defined] + 
) indexer, missing = self._engine.get_indexer_non_unique(tgt_values) return ensure_platform_int(indexer), ensure_platform_int(missing) diff --git a/pandas/tests/base/test_conversion.py b/pandas/tests/base/test_conversion.py index 49afe2859098d..84e4992cce0e3 100644 --- a/pandas/tests/base/test_conversion.py +++ b/pandas/tests/base/test_conversion.py @@ -256,11 +256,14 @@ def test_numpy_array_all_dtypes(any_numpy_dtype): ) def test_array(arr, attr, index_or_series, request): box = index_or_series + warn = None if arr.dtype.name in ("Sparse[int64, 0]") and box is pd.Index: mark = pytest.mark.xfail(reason="Index cannot yet store sparse dtype") request.node.add_marker(mark) + warn = FutureWarning - result = box(arr, copy=False).array + with tm.assert_produces_warning(warn): + result = box(arr, copy=False).array if attr: arr = getattr(arr, attr) @@ -330,7 +333,12 @@ def test_array_multiindex_raises(): ) def test_to_numpy(arr, expected, index_or_series_or_array, request): box = index_or_series_or_array - thing = box(arr) + + warn = None + if index_or_series_or_array is pd.Index and isinstance(arr, SparseArray): + warn = FutureWarning + with tm.assert_produces_warning(warn): + thing = box(arr) if arr.dtype.name == "int64" and box is pd.array: mark = pytest.mark.xfail(reason="thing is Int64 and to_numpy() returns object") diff --git a/pandas/tests/series/test_ufunc.py b/pandas/tests/series/test_ufunc.py index 23c432b2d10bf..d4e528789dedb 100644 --- a/pandas/tests/series/test_ufunc.py +++ b/pandas/tests/series/test_ufunc.py @@ -85,7 +85,10 @@ def test_binary_ufunc_with_index(flip, sparse, ufunc, arrays_for_binary_ufunc): name = "name" # op(pd.Series, array) preserves the name. 
series = pd.Series(a1, name=name) - other = pd.Index(a2, name=name).astype("int64") + + warn = None if not sparse else FutureWarning + with tm.assert_produces_warning(warn): + other = pd.Index(a2, name=name).astype("int64") array_args = (a1, a2) series_args = (series, other) # ufunc(series, array) @@ -275,7 +278,11 @@ def test_reduce(values, box, request): # ATM Index casts to object, so we get python ints/floats same_type = False - obj = box(values) + warn = None + if values.dtype == "Sparse[int]" and box is pd.Index: + warn = FutureWarning + with tm.assert_produces_warning(warn): + obj = box(values) result = np.maximum.reduce(obj) expected = values[1] From 8750248c9bf967620d434f57c1a232069ba3d5c1 Mon Sep 17 00:00:00 2001 From: Brock Date: Wed, 22 Dec 2021 10:37:51 -0800 Subject: [PATCH 56/57] mypy fixup --- pandas/core/indexes/base.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index bbcc40fa78d49..dff6b58437be3 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -5692,8 +5692,8 @@ def get_indexer_non_unique( if self._is_multi and target._is_multi: engine = self._engine # error: "IndexEngine" has no attribute "_extract_level_codes" - tgt_values = engine._extract_level_codes( - target # type: ignore[attr-defined] + tgt_values = engine._extract_level_codes( # type: ignore[attr-defined] + target ) indexer, missing = self._engine.get_indexer_non_unique(tgt_values) From d2e0266363ad2e5ffe89670ad19c60beca0f5e37 Mon Sep 17 00:00:00 2001 From: Brock Date: Wed, 22 Dec 2021 17:00:44 -0800 Subject: [PATCH 57/57] compat for older numpy --- pandas/tests/series/test_ufunc.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pandas/tests/series/test_ufunc.py b/pandas/tests/series/test_ufunc.py index d4e528789dedb..dd0d999d438fa 100644 --- a/pandas/tests/series/test_ufunc.py +++ b/pandas/tests/series/test_ufunc.py @@ -4,6 +4,8 @@ import numpy as np 
import pytest +from pandas.core.dtypes.common import is_dtype_equal + import pandas as pd import pandas._testing as tm from pandas.arrays import SparseArray @@ -279,7 +281,7 @@ def test_reduce(values, box, request): same_type = False warn = None - if values.dtype == "Sparse[int]" and box is pd.Index: + if is_dtype_equal(values.dtype, "Sparse[int]") and box is pd.Index: warn = FutureWarning with tm.assert_produces_warning(warn): obj = box(values)