From df9c2284c9bc2b5be943a21d993378d0e3dc226f Mon Sep 17 00:00:00 2001
From: Brock <jbrockmendel@gmail.com>
Date: Fri, 8 Oct 2021 09:54:47 -0700
Subject: [PATCH 01/57] ENH/WIP/POC: EA-backed Index

---
 pandas/_libs/index.pyx                        | 473 ++++++++++++++++++
 pandas/conftest.py                            |   8 +
 pandas/core/arrays/base.py                    |  19 +
 pandas/core/arrays/masked.py                  |  13 +-
 pandas/core/indexes/base.py                   |  35 +-
 pandas/core/indexes/interval.py               |   7 +-
 pandas/tests/arithmetic/test_numeric.py       |   4 +-
 .../arrays/categorical/test_constructors.py   |   3 +-
 pandas/tests/arrays/integer/test_dtypes.py    |   3 +-
 pandas/tests/base/test_value_counts.py        |  11 +
 pandas/tests/extension/base/ea_index.py       |  11 +
 pandas/tests/frame/test_reductions.py         |   2 +-
 pandas/tests/groupby/test_function.py         |   4 +-
 pandas/tests/groupby/test_quantile.py         |  16 +-
 pandas/tests/indexes/common.py                |   3 +-
 pandas/tests/indexes/test_numpy_compat.py     |  22 +-
 pandas/tests/indexing/test_indexing.py        |   3 +
 pandas/tests/series/methods/test_astype.py    |  10 +-
 pandas/tests/strings/test_extract.py          |   1 +
 19 files changed, 613 insertions(+), 35 deletions(-)
 create mode 100644 pandas/tests/extension/base/ea_index.py

diff --git a/pandas/_libs/index.pyx b/pandas/_libs/index.pyx
index ea0bebea8299b..fc6de84631d72 100644
--- a/pandas/_libs/index.pyx
+++ b/pandas/_libs/index.pyx
@@ -34,6 +34,7 @@ from pandas._libs import (
 )
 
 from pandas._libs.missing cimport (
+    C_NA as NA,
     checknull,
     is_matching_na,
 )
@@ -765,3 +766,475 @@ cdef class BaseMultiIndexCodesEngine:
 
 # Generated from template.
 include "index_class_helper.pxi"
+
+
+@cython.freelist(32)
+cdef class ExtensionEngine:
+    cdef readonly:
+        object values  # ExtensionArray
+        bint over_size_threshold
+
+    cdef:
+        bint unique, monotonic_inc, monotonic_dec
+        bint need_monotonic_check, need_unique_check
+
+    def __init__(self, values: "ExtensionArray"):
+        self.values = values
+
+        self.over_size_threshold = len(values) >= _SIZE_CUTOFF
+        # uniqueness/monotonicity are computed lazily on first access
+        self.need_unique_check = True
+        self.need_monotonic_check = True
+
+    def clear_mapping(self):
+        # for compat with IndexEngine
+        pass
+
+    @property
+    def is_unique(self) -> bool:
+        if self.need_unique_check:
+            arr = self.values.unique()
+            self.unique = len(arr) == len(self.values)
+
+            self.need_unique_check = False
+        return self.unique
+
+    @property
+    def is_monotonic_increasing(self) -> bool:
+        if self.need_monotonic_check:
+            self._do_monotonic_check()
+
+        return self.monotonic_inc == 1
+
+    @property
+    def is_monotonic_decreasing(self) -> bool:
+        if self.need_monotonic_check:
+            self._do_monotonic_check()
+
+        return self.monotonic_dec == 1
+
+    cdef inline _do_monotonic_check(self):
+        cdef:
+            bint is_unique
+
+        # FIXME: shouldn't depend on non-required _values_for_argsort
+        try:
+            self.monotonic_inc, self.monotonic_dec, is_unique = \
+                self._call_monotonic(self.values._values_for_argsort())
+        except TypeError:
+            self.monotonic_inc = 0
+            self.monotonic_dec = 0
+            is_unique = 0
+
+        self.need_monotonic_check = 0
+
+        # we can only be sure of uniqueness if is_unique=1
+        if is_unique:
+            self.unique = 1
+            self.need_unique_check = 0
+
+    cdef _call_monotonic(self, values):
+        return algos.is_monotonic(values, timelike=False)
+
+    def __contains__(self, val: object) -> bool:
+        # We assume before we get here:
+        #  - val is hashable
+        try:
+            self.get_loc(val)
+            return True
+        except KeyError:
+            return False
+
+    cpdef get_loc(self, object val):
+        # -> Py_ssize_t | slice | ndarray[bool]
+        cdef:
+            Py_ssize_t loc
+
+        if is_definitely_invalid_key(val):
+            raise TypeError(f"'{val}' is an invalid key")
+
+        self._check_type(val)
+
+        if self.over_size_threshold and self.is_monotonic_increasing:
+            if not self.is_unique:
+                return self._get_loc_duplicates(val)
+
+            values = self.values
+
+            loc = self._searchsorted_left(val)
+            if loc >= len(values):
+                raise KeyError(val)
+            if values[loc] != val:
+                raise KeyError(val)
+            return loc
+
+        if not self.unique:
+            return self._get_loc_duplicates(val)
+
+        return self._get_loc_duplicates(val)
+
+    cdef Py_ssize_t _searchsorted_left(self, val) except? -1:
+        """
+        See ObjectEngine._searchsorted_left.__doc__.
+        """
+        try:
+            loc = self.values.searchsorted(val, side="left")
+        except TypeError as err:
+            # GH#35788 e.g. val=None with float64 values
+            raise KeyError(val)
+        return loc
+
+    cdef inline _get_loc_duplicates(self, object val):
+        # -> Py_ssize_t | slice | ndarray[bool]
+        cdef:
+            Py_ssize_t diff
+
+        if self.is_monotonic_increasing:
+            values = self.values
+            try:
+                left = values.searchsorted(val, side='left')
+                right = values.searchsorted(val, side='right')
+            except TypeError:
+                # e.g. GH#29189 get_loc(None) with a Float64Index
+                raise KeyError(val)
+
+            diff = right - left
+            if diff == 0:
+                raise KeyError(val)
+            elif diff == 1:
+                return left
+            else:
+                return slice(left, right)
+
+        return self._maybe_get_bool_indexer(val)
+
+    cdef _get_bool_indexer(self, val):
+        if checknull(val):
+            return self.values.isna()  # FIXME: need to check for *matching* NA
+
+        return self.values == val
+
+    cdef _maybe_get_bool_indexer(self, object val):
+        # Returns ndarray[bool] or int, via the EA-aware self._get_bool_indexer
+        cdef:
+            ndarray[uint8_t, ndim=1, cast=True] indexer
+
+        indexer = self._get_bool_indexer(val)
+        return _unpack_bool_indexer(indexer, val)
+
+    def sizeof(self, deep: bool = False) -> int:
+        """ return the sizeof our mapping """
+        return 0
+
+    def __sizeof__(self) -> int:
+        return self.sizeof()
+
+    cdef _check_type(self, object val):
+        hash(val)
+
+    def get_indexer(self, values: "ExtensionArray") -> np.ndarray:
+        # Note: we only get here with self.is_unique
+        cdef:
+            Py_ssize_t i, N = len(values)
+
+        res = np.empty(N, dtype=np.intp)
+
+        for i in range(N):
+            val = values[i]
+            try:
+                loc = self.get_loc(val)
+                # Because we are unique, loc should always be an integer
+            except KeyError:
+                loc = -1
+            res[i] = loc
+
+        return res
+
+    def get_indexer_non_unique(self, targets: "ExtensionArray"):
+        """
+        Return an indexer suitable for taking from a non-unique index.
+        The labels are returned in the same order as the targets,
+        together with a missing indexer into the targets (which corresponds
+        to the -1 entries in the result).
+
+        Returns
+        -------
+        indexer : np.ndarray[np.intp]
+        missing : np.ndarray[np.intp]
+        """
+        cdef:
+            Py_ssize_t i, N = len(targets)
+
+        indexer = []
+        missing = []
+
+        # See also IntervalIndex.get_indexer_pointwise
+        for i in range(N):
+            val = targets[i]
+
+            try:
+                locs = self.get_loc(val)
+            except KeyError:
+                locs = np.array([-1], dtype=np.intp)
+                missing.append(i)
+            else:
+                if isinstance(locs, slice):
+                    # Only needed for get_indexer_non_unique
+                    locs = np.arange(locs.start, locs.stop, locs.step, dtype=np.intp)
+                elif util.is_integer_object(locs):
+                    locs = np.array([locs], dtype=np.intp)
+                else:
+                    assert locs.dtype.kind == "b"
+                    locs = locs.nonzero()[0]
+
+            indexer.append(locs)
+
+        indexer = np.concatenate(indexer, dtype=np.intp)
+        missing = np.array(missing, dtype=np.intp)
+
+        return indexer, missing
+
+
+@cython.freelist(32)
+cdef class NullableEngine:
+
+    cdef readonly:
+        ndarray _values, _mask
+        bint over_size_threshold
+        bint has_missing
+        object values  # MaskedArray
+
+    cdef:
+        bint unique, monotonic_inc, monotonic_dec
+        bint need_monotonic_check, need_unique_check
+
+    def __init__(self, values: "MaskedArray"):
+        self.values = values
+        self._values = values._data
+        self._mask = values._mask
+        self.has_missing = values._mask.any()
+
+        self.over_size_threshold = len(values) >= _SIZE_CUTOFF
+        # bint attrs default to 0; unset flags would skip the lazy checks
+        self.need_unique_check = True
+        self.need_monotonic_check = True
+
+    def clear_mapping(self):
+        # for compat with IndexEngine
+        pass
+
+    @property
+    def is_unique(self) -> bool:
+        if self.need_unique_check:
+            arr = self.values.unique()
+            self.unique = len(arr) == len(self.values)
+
+            self.need_unique_check = False
+        return self.unique
+
+    @property
+    def is_monotonic_increasing(self) -> bool:
+        if self.need_monotonic_check:
+            self._do_monotonic_check()
+
+        return self.monotonic_inc == 1
+
+    @property
+    def is_monotonic_decreasing(self) -> bool:
+        if self.need_monotonic_check:
+            self._do_monotonic_check()
+
+        return self.monotonic_dec == 1
+
+    cdef inline _do_monotonic_check(self):
+        cdef:
+            bint is_unique
+
+        if self.has_missing:
+            self.monotonic_inc = 0
+            self.monotonic_dec = 0
+            self.need_monotonic_check = 0
+            return
+
+        # If there are no missing, then we can just look at self._values
+        try:
+            self.monotonic_inc, self.monotonic_dec, is_unique = \
+                self._call_monotonic(self._values)
+        except TypeError:
+            self.monotonic_inc = 0
+            self.monotonic_dec = 0
+            is_unique = 0
+
+        self.need_monotonic_check = 0
+
+        # we can only be sure of uniqueness if is_unique=1
+        if is_unique:
+            self.unique = 1
+            self.need_unique_check = 0
+
+    cdef _call_monotonic(self, values):
+        return algos.is_monotonic(values, timelike=False)
+
+    def __contains__(self, val: object) -> bool:
+        # We assume before we get here:
+        #  - val is hashable
+        try:
+            self.get_loc(val)
+            return True
+        except KeyError:
+            return False
+
+    cpdef get_loc(self, object val):
+        # -> Py_ssize_t | slice | ndarray[bool]
+        cdef:
+            Py_ssize_t loc
+
+        if is_definitely_invalid_key(val):
+            raise TypeError(f"'{val}' is an invalid key")
+
+        self._check_type(val)
+
+        if self.over_size_threshold and self.is_monotonic_increasing:
+            if not self.is_unique:
+                return self._get_loc_duplicates(val)
+
+            values = self.values
+
+            loc = self._searchsorted_left(val)
+            if loc >= len(values):
+                raise KeyError(val)
+            if values[loc] != val:
+                raise KeyError(val)
+            return loc
+
+        if not self.unique:
+            return self._get_loc_duplicates(val)
+
+        return self._get_loc_duplicates(val)
+
+    cdef Py_ssize_t _searchsorted_left(self, val) except? -1:
+        """
+        See ObjectEngine._searchsorted_left.__doc__.
+        """
+        try:
+            loc = self.values.searchsorted(val, side="left")
+        except TypeError as err:
+            # GH#35788 e.g. val=None with float64 values
+            raise KeyError(val)
+        return loc
+
+    cdef inline _get_loc_duplicates(self, object val):
+        # -> Py_ssize_t | slice | ndarray[bool]
+        cdef:
+            Py_ssize_t diff
+
+        if self.is_monotonic_increasing:
+            values = self.values
+            try:
+                left = values.searchsorted(val, side='left')
+                right = values.searchsorted(val, side='right')
+            except TypeError:
+                # e.g. GH#29189 get_loc(None) with a Float64Index
+                raise KeyError(val)
+
+            diff = right - left
+            if diff == 0:
+                raise KeyError(val)
+            elif diff == 1:
+                return left
+            else:
+                return slice(left, right)
+
+        return self._maybe_get_bool_indexer(val)
+
+    cdef _get_bool_indexer(self, val):
+        if val is NA:
+            return self._mask
+
+        if util.is_nan(val):
+            res = np.isnan(self._values)
+        else:
+            res = self._values == val
+
+        res[self._mask] = False
+        return res
+
+    cdef _maybe_get_bool_indexer(self, object val):
+        # Returns ndarray[bool] or int
+        cdef:
+            ndarray[uint8_t, ndim=1, cast=True] indexer
+
+        indexer = self._get_bool_indexer(val)
+        return _unpack_bool_indexer(indexer, val)
+
+    def sizeof(self, deep: bool = False) -> int:
+        """ return the sizeof our mapping """
+        return 0
+
+    def __sizeof__(self) -> int:
+        return self.sizeof()
+
+    cdef _check_type(self, object val):
+        hash(val)
+
+    def get_indexer(self, values: "MaskedArray") -> np.ndarray:
+        # Note: we only get here with self.is_unique
+        cdef:
+            Py_ssize_t i, N = len(values)
+
+        res = np.empty(N, dtype=np.intp)
+
+        for i in range(N):
+            val = values[i]
+            try:
+                loc = self.get_loc(val)
+                # Because we are unique, loc should always be an integer
+            except KeyError:
+                loc = -1
+            res[i] = loc
+
+        return res
+
+    def get_indexer_non_unique(self, targets: "MaskedArray"):
+        """
+        Return an indexer suitable for taking from a non-unique index.
+        The labels are returned in the same order as the targets,
+        together with a missing indexer into the targets (which corresponds
+        to the -1 entries in the result).
+
+        Returns
+        -------
+        indexer : np.ndarray[np.intp]
+        missing : np.ndarray[np.intp]
+        """
+        cdef:
+            Py_ssize_t i, N = len(targets)
+
+        indexer = []
+        missing = []
+
+        # See also IntervalIndex.get_indexer_pointwise
+        for i in range(N):
+            val = targets[i]
+
+            try:
+                locs = self.get_loc(val)
+            except KeyError:
+                locs = np.array([-1], dtype=np.intp)
+                missing.append(i)
+            else:
+                if isinstance(locs, slice):
+                    # Only needed for get_indexer_non_unique
+                    locs = np.arange(locs.start, locs.stop, locs.step, dtype=np.intp)
+                elif util.is_integer_object(locs):
+                    locs = np.array([locs], dtype=np.intp)
+                else:
+                    assert locs.dtype.kind == "b"
+                    locs = locs.nonzero()[0]
+
+            indexer.append(locs)
+
+        indexer = np.concatenate(indexer, dtype=np.intp)
+        missing = np.array(missing, dtype=np.intp)
+
+        return indexer, missing
diff --git a/pandas/conftest.py b/pandas/conftest.py
index 44b805c632723..3428c6a65da79 100644
--- a/pandas/conftest.py
+++ b/pandas/conftest.py
@@ -498,6 +498,14 @@ def _create_mi_with_dt64tz_level():
     "mi-with-dt64tz-level": _create_mi_with_dt64tz_level(),
     "multi": _create_multiindex(),
     "repeats": Index([0, 0, 1, 1, 2, 2]),
+    "nullable_int": Index(np.arange(100), dtype="Int64"),
+    "nullable_float": Index(np.arange(100), dtype="Float32"),
+    "nullable_bool": Index(np.arange(100).astype(bool), dtype="boolean"),
+    "nullable_int-na": Index(np.arange(100), dtype="Int64").insert(1, pd.NA),
+    "nullable_float-na": Index(np.arange(100), dtype="Float32").insert(1, pd.NA),
+    "nullable_bool-na": Index(np.arange(100).astype(bool), dtype="boolean").insert(
+        1, pd.NA
+    ),
 }
 
 
diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py
index 46b0a6873986e..91ed9466bfb69 100644
--- a/pandas/core/arrays/base.py
+++ b/pandas/core/arrays/base.py
@@ -1348,6 +1348,25 @@ def _reduce(self, name: str, *, skipna: bool = True, **kwargs):
     # ------------------------------------------------------------------------
     # Non-Optimized Default Methods
 
+    def putmask(self, mask: np.ndarray, value) -> None:
+        """
+        Analogue to np.putmask(self, mask, value); assigns into self in place.
+
+        Parameters
+        ----------
+        mask : np.ndarray[bool]
+        value : scalar or listlike
+
+        Raises
+        ------
+        TypeError
+            If value cannot be inserted into self.
+        """
+        if not is_list_like(value):
+            self[mask] = value
+        else:
+            self[mask] = value[mask]
+
     def delete(self: ExtensionArrayT, loc: PositionalIndexer) -> ExtensionArrayT:
         indexer = np.delete(np.arange(len(self)), loc)
         return self.take(indexer)
diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py
index 6a03456673604..3737719460431 100644
--- a/pandas/core/arrays/masked.py
+++ b/pandas/core/arrays/masked.py
@@ -564,7 +564,7 @@ def value_counts(self, dropna: bool = True) -> Series:
         # TODO(extension)
         # if we have allow Index to hold an ExtensionArray
         # this is easier
-        index = value_counts.index._values.astype(object)
+        index = value_counts.index  # ._values.astype(object)
 
         # if we want nans, count the mask
         if dropna:
@@ -574,10 +574,13 @@ def value_counts(self, dropna: bool = True) -> Series:
             counts[:-1] = value_counts
             counts[-1] = self._mask.sum()
 
-            index = Index(
-                np.concatenate([index, np.array([self.dtype.na_value], dtype=object)]),
-                dtype=object,
-            )
+            # Append NA as the last label so that it lines up with
+            #  counts[-1], which holds the number of masked entries.
+            #  Note insert(-1, ...) follows np.insert semantics and would
+            #  place NA *before* the final existing label instead.
+            index = index.insert(len(index), self.dtype.na_value)
+
+        index = index.astype(self.dtype)
 
         mask = np.zeros(len(counts), dtype="bool")
         counts = IntegerArray(counts, mask)
diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py
index da953fe46ef1d..d2d9c70404367 100644
--- a/pandas/core/indexes/base.py
+++ b/pandas/core/indexes/base.py
@@ -135,6 +135,7 @@
     tz_to_dtype,
     validate_tz_from_dtype,
 )
+from pandas.core.arrays.masked import BaseMaskedArray
 from pandas.core.arrays.sparse import SparseDtype
 from pandas.core.base import (
     IndexOpsMixin,
@@ -410,8 +411,9 @@ def __new__(
             validate_tz_from_dtype(dtype, tz)
             dtype = tz_to_dtype(tz)
 
-        if isinstance(data, PandasArray):
-            # ensure users don't accidentally put a PandasArray in an index.
+        if type(data) is PandasArray:
+            # ensure users don't accidentally put a PandasArray in an index,
+            #  but don't unpack StringArray
             data = data.to_numpy()
         if isinstance(dtype, PandasDtype):
             dtype = dtype.numpy_dtype
@@ -433,7 +435,6 @@ def __new__(
 
             ea_cls = dtype.construct_array_type()
             data = ea_cls._from_sequence(data, dtype=dtype, copy=copy)
-            data = np.asarray(data, dtype=object)
             disallow_kwargs(kwargs)
             return Index._simple_new(data, name=name)
 
@@ -445,8 +446,8 @@ def __new__(
                     return result.astype(dtype, copy=False)
                 return result
 
-            data = np.array(data, dtype=object, copy=copy)
             disallow_kwargs(kwargs)
+            data = extract_array(data, extract_numpy=True)
             return Index._simple_new(data, name=name)
 
         # index-like
@@ -640,7 +641,7 @@ def _simple_new(cls: type[_IndexT], values, name: Hashable = None) -> _IndexT:
 
         Must be careful not to recurse.
         """
-        assert isinstance(values, np.ndarray), type(values)
+        assert isinstance(values, (np.ndarray, ExtensionArray))
 
         result = object.__new__(cls)
         result._data = values
@@ -656,6 +657,7 @@ def _with_infer(cls, *args, **kwargs):
         Constructor that uses the 1.0.x behavior inferring numeric dtypes
         for ndarray[object] inputs.
         """
+
         with warnings.catch_warnings():
             warnings.filterwarnings("ignore", ".*the Index constructor", FutureWarning)
             result = cls(*args, **kwargs)
@@ -811,6 +813,14 @@ def _cleanup(self) -> None:
     def _engine(self) -> libindex.IndexEngine:
         # For base class (object dtype) we get ObjectEngine
 
+        if isinstance(self._values, BaseMaskedArray):
+            return libindex.NullableEngine(self._values)
+        elif (
+            isinstance(self._values, ExtensionArray)
+            and self._engine_type is libindex.ObjectEngine
+        ):
+            return libindex.ExtensionEngine(self._values)
+
         # to avoid a reference cycle, bind `target_values` to a local variable, so
         # `self` is not passed into the lambda.
         target_values = self._get_engine_target()
@@ -1024,9 +1034,15 @@ def take(
 
         # Note: we discard fill_value and use self._na_value, only relevant
         #  in the case where allow_fill is True and fill_value is not None
-        taken = algos.take(
-            self._values, indices, allow_fill=allow_fill, fill_value=self._na_value
-        )
+        values = self._values
+        if isinstance(values, np.ndarray):
+            taken = algos.take(
+                values, indices, allow_fill=allow_fill, fill_value=self._na_value
+            )
+        else:
+            taken = values.take(
+                indices, allow_fill=allow_fill, fill_value=self._na_value
+            )
         return type(self)._simple_new(taken, name=self.name)
 
     @final
@@ -5012,6 +5028,9 @@ def equals(self, other: Any) -> bool:
             # d-level MultiIndex can equal d-tuple Index
             return other.equals(self)
 
+        if is_extension_array_dtype(self.dtype):
+            return self._values.equals(other._values)
+
         if is_extension_array_dtype(other.dtype):
             # All EA-backed Index subclasses override equals
             return other.equals(self)
diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py
index 165048e2a591a..bfef143ed187b 100644
--- a/pandas/core/indexes/interval.py
+++ b/pandas/core/indexes/interval.py
@@ -725,7 +725,12 @@ def _get_indexer_pointwise(
                 if isinstance(locs, slice):
                     # Only needed for get_indexer_non_unique
                     locs = np.arange(locs.start, locs.stop, locs.step, dtype="intp")
-                locs = np.array(locs, ndmin=1)
+                elif lib.is_integer(locs):
+                    locs = np.array(locs, ndmin=1)
+                else:
+                    # FIXME: This is wrong; it's boolean; not reached
+                    assert locs.dtype.kind == "i"
+
             except KeyError:
                 missing.append(i)
                 locs = np.array([-1])
diff --git a/pandas/tests/arithmetic/test_numeric.py b/pandas/tests/arithmetic/test_numeric.py
index 2cdc35bdf51cb..1a55e65aee93c 100644
--- a/pandas/tests/arithmetic/test_numeric.py
+++ b/pandas/tests/arithmetic/test_numeric.py
@@ -1401,9 +1401,9 @@ def test_integer_array_add_list_like(
     if Series == box_pandas_1d_array:
         expected = Series(expected_data, dtype="Int64")
     elif Series == box_1d_array:
-        expected = Series(expected_data, dtype="object")
+        expected = Series(expected_data, dtype="Int64")
     elif Index in (box_pandas_1d_array, box_1d_array):
-        expected = Int64Index(expected_data)
+        expected = Index(expected_data, dtype="Int64")
     else:
         expected = np.array(expected_data, dtype="object")
 
diff --git a/pandas/tests/arrays/categorical/test_constructors.py b/pandas/tests/arrays/categorical/test_constructors.py
index ee24ecb4964ec..05ea29081cc7c 100644
--- a/pandas/tests/arrays/categorical/test_constructors.py
+++ b/pandas/tests/arrays/categorical/test_constructors.py
@@ -728,7 +728,8 @@ def test_categorical_extension_array_nullable(self, nulls_fixture):
         # GH:
         arr = pd.arrays.StringArray._from_sequence([nulls_fixture] * 2)
         result = Categorical(arr)
-        expected = Categorical(Series([pd.NA, pd.NA], dtype="object"))
+        assert arr.dtype == result.categories.dtype
+        expected = Categorical(Series([pd.NA, pd.NA], dtype=arr.dtype))
         tm.assert_categorical_equal(result, expected)
 
     def test_from_sequence_copy(self):
diff --git a/pandas/tests/arrays/integer/test_dtypes.py b/pandas/tests/arrays/integer/test_dtypes.py
index f8b1ea2ebde23..c71ba7209cc44 100644
--- a/pandas/tests/arrays/integer/test_dtypes.py
+++ b/pandas/tests/arrays/integer/test_dtypes.py
@@ -72,7 +72,8 @@ def test_construct_index(all_data, dropna):
         other = all_data
 
     result = pd.Index(pd.array(other, dtype=all_data.dtype))
-    expected = pd.Index(other, dtype=object)
+    expected = pd.Index(other, dtype=all_data.dtype)
+    assert all_data.dtype == expected.dtype  # don't coerce to object
 
     tm.assert_index_equal(result, expected)
 
diff --git a/pandas/tests/base/test_value_counts.py b/pandas/tests/base/test_value_counts.py
index 10f391a49d98f..47cd29c49000c 100644
--- a/pandas/tests/base/test_value_counts.py
+++ b/pandas/tests/base/test_value_counts.py
@@ -35,10 +35,16 @@ def test_value_counts(index_or_series_obj):
     if isinstance(obj, pd.MultiIndex):
         expected.index = Index(expected.index)
 
+    if not isinstance(result.dtype, np.dtype):
+        # TODO: be more specific
+        # i.e IntegerDtype
+        expected = expected.astype(result.dtype)
+
     # TODO: Order of entries with the same count is inconsistent on CI (gh-32449)
     if obj.duplicated().any():
         result = result.sort_index()
         expected = expected.sort_index()
+
     tm.assert_series_equal(result, expected)
 
 
@@ -76,6 +82,11 @@ def test_value_counts_null(null_obj, index_or_series_obj):
         #  Order of entries with the same count is inconsistent on CI (gh-32449)
         expected = expected.sort_index()
         result = result.sort_index()
+
+    if not isinstance(result.dtype, np.dtype):
+        # TODO: be more specific
+        # i.e IntegerDtype
+        expected = expected.astype(result.dtype)
     tm.assert_series_equal(result, expected)
 
     # can't use expected[null_obj] = 3 as
diff --git a/pandas/tests/extension/base/ea_index.py b/pandas/tests/extension/base/ea_index.py
new file mode 100644
index 0000000000000..8309842f7134f
--- /dev/null
+++ b/pandas/tests/extension/base/ea_index.py
@@ -0,0 +1,11 @@
+"""
+Tests for Indexes backed by arbitrary ExtensionArrays.
+"""
+import pandas as pd
+from pandas.tests.extension.base.base import BaseExtensionTests
+
+
+class BaseExtensionIndexTests(BaseExtensionTests):
+    def test_index_from_array(self, data):
+        idx = pd.Index(data)
+        assert data.dtype == idx.dtype
diff --git a/pandas/tests/frame/test_reductions.py b/pandas/tests/frame/test_reductions.py
index 258e4e6eb0cc9..d30d810703594 100644
--- a/pandas/tests/frame/test_reductions.py
+++ b/pandas/tests/frame/test_reductions.py
@@ -1067,7 +1067,7 @@ def test_idxmax_idxmin_convert_dtypes(self, op, expected_value):
         result = getattr(df, op)()
         expected = DataFrame(
             {"value": expected_value},
-            index=Index([100, 200], name="ID"),
+            index=Index([100, 200], name="ID", dtype="Int64"),
         )
         tm.assert_frame_equal(result, expected)
 
diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py
index 3ae11847cc06b..edc366d2df4f4 100644
--- a/pandas/tests/groupby/test_function.py
+++ b/pandas/tests/groupby/test_function.py
@@ -1126,7 +1126,7 @@ def test_apply_to_nullable_integer_returns_float(values, function):
     # https://github.com/pandas-dev/pandas/issues/32219
     output = 0.5 if function == "var" else 1.5
     arr = np.array([output] * 3, dtype=float)
-    idx = Index([1, 2, 3], name="a")
+    idx = Index([1, 2, 3], name="a", dtype="Int64")
     expected = DataFrame({"b": arr}, index=idx).astype("Float64")
 
     groups = DataFrame(values, dtype="Int64").groupby("a")
@@ -1146,7 +1146,7 @@ def test_groupby_sum_below_mincount_nullable_integer():
     # https://github.com/pandas-dev/pandas/issues/32861
     df = DataFrame({"a": [0, 1, 2], "b": [0, 1, 2], "c": [0, 1, 2]}, dtype="Int64")
     grouped = df.groupby("a")
-    idx = Index([0, 1, 2], name="a")
+    idx = Index([0, 1, 2], name="a", dtype="Int64")
 
     result = grouped["b"].sum(min_count=2)
     expected = Series([pd.NA] * 3, dtype="Int64", index=idx, name="b")
diff --git a/pandas/tests/groupby/test_quantile.py b/pandas/tests/groupby/test_quantile.py
index bcb2abeed75e4..1badc4aa7995a 100644
--- a/pandas/tests/groupby/test_quantile.py
+++ b/pandas/tests/groupby/test_quantile.py
@@ -251,14 +251,14 @@ def test_groupby_quantile_NA_float(any_float_dtype):
     # GH#42849
     df = DataFrame({"x": [1, 1], "y": [0.2, np.nan]}, dtype=any_float_dtype)
     result = df.groupby("x")["y"].quantile(0.5)
-    expected = pd.Series([0.2], dtype=float, index=[1.0], name="y")
-    expected.index.name = "x"
+    exp_index = Index([1.0], dtype=any_float_dtype, name="x")
+    expected = pd.Series([0.2], dtype=float, index=exp_index, name="y")
     tm.assert_series_equal(expected, result)
 
     result = df.groupby("x")["y"].quantile([0.5, 0.75])
     expected = pd.Series(
         [0.2] * 2,
-        index=pd.MultiIndex.from_product(([1.0], [0.5, 0.75]), names=["x", None]),
+        index=pd.MultiIndex.from_product((exp_index, [0.5, 0.75]), names=["x", None]),
         name="y",
     )
     tm.assert_series_equal(result, expected)
@@ -268,11 +268,13 @@ def test_groupby_quantile_NA_int(any_int_ea_dtype):
     # GH#42849
     df = DataFrame({"x": [1, 1], "y": [2, 5]}, dtype=any_int_ea_dtype)
     result = df.groupby("x")["y"].quantile(0.5)
-    expected = pd.Series([3.5], dtype=float, index=Index([1], name="x"), name="y")
+    expected = pd.Series(
+        [3.5], dtype=float, index=Index([1], name="x", dtype=any_int_ea_dtype), name="y"
+    )
     tm.assert_series_equal(expected, result)
 
     result = df.groupby("x").quantile(0.5)
-    expected = DataFrame({"y": 3.5}, index=Index([1], name="x"))
+    expected = DataFrame({"y": 3.5}, index=Index([1], name="x", dtype=any_int_ea_dtype))
     tm.assert_frame_equal(result, expected)
 
 
@@ -281,7 +283,9 @@ def test_groupby_quantile_allNA_column(dtype):
     # GH#42849
     df = DataFrame({"x": [1, 1], "y": [pd.NA] * 2}, dtype=dtype)
     result = df.groupby("x")["y"].quantile(0.5)
-    expected = pd.Series([np.nan], dtype=float, index=[1.0], name="y")
+    expected = pd.Series(
+        [np.nan], dtype=float, index=Index([1.0], dtype=dtype), name="y"
+    )
     expected.index.name = "x"
     tm.assert_series_equal(expected, result)
 
diff --git a/pandas/tests/indexes/common.py b/pandas/tests/indexes/common.py
index a8684ca4d3c25..6606bcd2e08f9 100644
--- a/pandas/tests/indexes/common.py
+++ b/pandas/tests/indexes/common.py
@@ -525,6 +525,7 @@ def test_format_empty(self):
         assert empty_idx.format() == []
         assert empty_idx.format(name=True) == [""]
 
+    # TODO: doesn't belong in this class, gets re-run for every subclass
     def test_hasnans_isnans(self, index_flat):
         # GH 11343, added tests for hasnans / isnans
         index = index_flat
@@ -547,7 +548,7 @@ def test_hasnans_isnans(self, index_flat):
         else:
             values[1] = np.nan
 
-        if isinstance(index, PeriodIndex):
+        if False:  # isinstance(index, PeriodIndex):
             idx = type(index)(values, freq=index.freq)
         else:
             idx = type(index)(values)
diff --git a/pandas/tests/indexes/test_numpy_compat.py b/pandas/tests/indexes/test_numpy_compat.py
index 9cc1205310ea7..c4a0936d8d633 100644
--- a/pandas/tests/indexes/test_numpy_compat.py
+++ b/pandas/tests/indexes/test_numpy_compat.py
@@ -10,6 +10,7 @@
 )
 import pandas._testing as tm
 from pandas.core.api import Float64Index
+from pandas.core.arrays import BooleanArray
 from pandas.core.indexes.datetimelike import DatetimeIndexOpsMixin
 
 
@@ -49,14 +50,21 @@ def test_numpy_ufuncs_basic(index, func):
         with tm.external_error_raised((TypeError, AttributeError)):
             with np.errstate(all="ignore"):
                 func(index)
-    elif isinstance(index, NumericIndex):
+    elif isinstance(index, NumericIndex) or (
+        not isinstance(index.dtype, np.dtype) and index.dtype._is_numeric
+    ):
         # coerces to float (e.g. np.sin)
         with np.errstate(all="ignore"):
             result = func(index)
             exp = Index(func(index.values), name=index.name)
 
         tm.assert_index_equal(result, exp)
-        assert isinstance(result, Float64Index)
+        if type(index) is not Index:
+            # i.e. NumericIndex
+            assert isinstance(result, Float64Index)
+        else:
+            # e.g. np.exp with Int64 -> Float64
+            assert type(result) is Index
     else:
         # raise AttributeError or TypeError
         if len(index) == 0:
@@ -94,10 +102,16 @@ def test_numpy_ufuncs_other(index, func, request):
         with tm.external_error_raised(TypeError):
             func(index)
 
-    elif isinstance(index, NumericIndex):
+    elif isinstance(index, NumericIndex) or (
+        not isinstance(index.dtype, np.dtype) and index.dtype._is_numeric
+    ):
         # Results in bool array
         result = func(index)
-        assert isinstance(result, np.ndarray)
+        if not isinstance(index.dtype, np.dtype):
+            # e.g. Int64 we expect to get BooleanArray back
+            assert isinstance(result, BooleanArray)
+        else:
+            assert isinstance(result, np.ndarray)
         assert not isinstance(result, Index)
     else:
         if len(index) == 0:
diff --git a/pandas/tests/indexing/test_indexing.py b/pandas/tests/indexing/test_indexing.py
index 01407f1f9bae7..d5c1d6f1533d3 100644
--- a/pandas/tests/indexing/test_indexing.py
+++ b/pandas/tests/indexing/test_indexing.py
@@ -99,6 +99,9 @@ def test_getitem_ndarray_3d(
             msgs.append("Data must be 1-dimensional")
         if len(index) == 0 or isinstance(index, pd.MultiIndex):
             msgs.append("positional indexers are out-of-bounds")
+        if type(index) is Index and not isinstance(index._values, np.ndarray):
+            # e.g. Int64
+            msgs.append("values must be a 1D array")
         msg = "|".join(msgs)
 
         potential_errors = (IndexError, ValueError, NotImplementedError)
diff --git a/pandas/tests/series/methods/test_astype.py b/pandas/tests/series/methods/test_astype.py
index 732d375d136d0..a9833e746ecb2 100644
--- a/pandas/tests/series/methods/test_astype.py
+++ b/pandas/tests/series/methods/test_astype.py
@@ -427,9 +427,13 @@ def test_astype_string_to_extension_dtype_roundtrip(
             )
             request.node.add_marker(mark)
         # GH-40351
-        s = Series(data, dtype=dtype)
-        result = s.astype(nullable_string_dtype).astype(dtype)
-        tm.assert_series_equal(result, s)
+        ser = Series(data, dtype=dtype)
+
+        # Note: just passing .astype(dtype) fails for dtype="category"
+        #  because ser.dtype.categories will be object dtype whereas
+        #  result.dtype.categories will have string dtype
+        result = ser.astype(nullable_string_dtype).astype(ser.dtype)
+        tm.assert_series_equal(result, ser)
 
 
 class TestAstypeCategorical:
diff --git a/pandas/tests/strings/test_extract.py b/pandas/tests/strings/test_extract.py
index 16ec4a8c6831c..9d5380955acae 100644
--- a/pandas/tests/strings/test_extract.py
+++ b/pandas/tests/strings/test_extract.py
@@ -258,6 +258,7 @@ def test_extract_expand_True_single_capture_group(index_or_series, any_string_dt
     s_or_idx = index_or_series(["A1", "A2"], dtype=any_string_dtype)
     result = s_or_idx.str.extract(r"(?P<uno>A)\d", expand=True)
     expected_dtype = "object" if index_or_series is Index else any_string_dtype
+    expected_dtype = any_string_dtype
     expected = DataFrame({"uno": ["A", "A"]}, dtype=expected_dtype)
     tm.assert_frame_equal(result, expected)
 

From 95e012971a604f1aa759334bc71a43c5a8d75a57 Mon Sep 17 00:00:00 2001
From: Brock <jbrockmendel@gmail.com>
Date: Fri, 8 Oct 2021 12:30:30 -0700
Subject: [PATCH 02/57] BUG: NumericIndex.insert

---
 pandas/core/indexes/base.py    |  7 ++++---
 pandas/tests/indexes/common.py | 19 +++++++++++++++++++
 2 files changed, 23 insertions(+), 3 deletions(-)

diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py
index da953fe46ef1d..2ff9b3973a526 100644
--- a/pandas/core/indexes/base.py
+++ b/pandas/core/indexes/base.py
@@ -6329,10 +6329,11 @@ def insert(self, loc: int, item) -> Index:
 
         arr = np.asarray(self)
 
-        # Use Index constructor to ensure we get tuples cast correctly.
-        item = Index([item], dtype=self.dtype)._values
+        # Use constructor to ensure we get tuples cast correctly.
+        # Use self._constructor instead of Index to retain NumericIndex GH#43921
+        item = self._constructor([item], dtype=self.dtype)._values
         idx = np.concatenate((arr[:loc], item, arr[loc:]))
-        return Index._with_infer(idx, name=self.name)
+        return self._constructor._with_infer(idx, name=self.name)
 
     def drop(self, labels, errors: str_t = "raise") -> Index:
         """
diff --git a/pandas/tests/indexes/common.py b/pandas/tests/indexes/common.py
index 8357595fdaa40..7e43664c6b3de 100644
--- a/pandas/tests/indexes/common.py
+++ b/pandas/tests/indexes/common.py
@@ -793,6 +793,20 @@ def test_format(self, simple_index):
     def test_numeric_compat(self):
         pass  # override Base method
 
+    def test_insert_non_na(self, simple_index):
+        # GH#43921 inserting an element that we know we can hold should
+        #  not change dtype or type (except for RangeIndex)
+        index = simple_index
+
+        result = index.insert(0, index[0])
+
+        cls = type(index)
+        if cls is RangeIndex:
+            cls = Int64Index
+
+        expected = cls([index[0]] + list(index), dtype=index.dtype)
+        tm.assert_index_equal(result, expected)
+
     def test_insert_na(self, nulls_fixture, simple_index):
         # GH 18295 (test missing)
         index = simple_index
@@ -800,6 +814,11 @@ def test_insert_na(self, nulls_fixture, simple_index):
 
         if na_val is pd.NaT:
             expected = Index([index[0], pd.NaT] + list(index[1:]), dtype=object)
+        elif type(index) is NumericIndex and index.dtype.kind == "f":
+            # GH#43921
+            expected = NumericIndex(
+                [index[0], np.nan] + list(index[1:]), dtype=index.dtype
+            )
         else:
             expected = Float64Index([index[0], np.nan] + list(index[1:]))
 

From d53377d841d62860e6c216477304248ae8dbaba0 Mon Sep 17 00:00:00 2001
From: Brock <jbrockmendel@gmail.com>
Date: Sun, 10 Oct 2021 13:21:43 -0700
Subject: [PATCH 03/57] fix a few more tests; ignoring linting for now

---
 pandas/core/indexes/base.py            | 20 ++++++++++++++++++--
 pandas/core/indexing.py                |  2 +-
 pandas/tests/base/test_value_counts.py |  5 +++--
 pandas/tests/strings/test_extract.py   |  4 +---
 4 files changed, 23 insertions(+), 8 deletions(-)

diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py
index 8456823cbacc6..19b44784c7f99 100644
--- a/pandas/core/indexes/base.py
+++ b/pandas/core/indexes/base.py
@@ -6316,7 +6316,12 @@ def delete(self: _IndexT, loc) -> _IndexT:
         >>> idx.delete([0, 2])
         Index(['b'], dtype='object')
         """
-        res_values = np.delete(self._data, loc)
+        values = self._values
+        if isinstance(values, np.ndarray):
+            res_values = np.delete(values, loc)
+        else:
+            # TODO(__array_function__) special-casing unnecessary
+            res_values = values.delete(loc)
         return type(self)._simple_new(res_values, name=self.name)
 
     def insert(self, loc: int, item) -> Index:
@@ -6346,7 +6351,18 @@ def insert(self, loc: int, item) -> Index:
             dtype = self._find_common_type_compat(item)
             return self.astype(dtype).insert(loc, item)
 
-        arr = np.asarray(self)
+        arr = self._values
+        if isinstance(arr, ExtensionArray):
+            # TODO: need EA.insert
+            try:
+                arr2 = type(arr)._from_sequence([item], dtype=arr.dtype)
+            except TypeError:
+                # TODO: make this into _validate_fill_value
+                dtype = self._find_common_type_compat(item)
+                return self.astype(dtype).insert(loc, item)
+
+            res_values = arr._concat_same_type([arr[:loc], arr2, arr[loc:]])
+            return type(self)._simple_new(res_values, name=self.name)
 
         # Use constructor to ensure we get tuples cast correctly.
         # Use self._constructor instead of Index to retain NumericIndex GH#43921
diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py
index bbb3cb3391dfa..31c1619fba4d8 100644
--- a/pandas/core/indexing.py
+++ b/pandas/core/indexing.py
@@ -964,7 +964,7 @@ def _validate_key(self, key, axis: int):
         # slice of labels (where start-end in labels)
         # slice of integers (only if in the labels)
         # boolean not in slice and with boolean index
-        if isinstance(key, bool) and not is_bool_dtype(self.obj.index):
+        if isinstance(key, bool) and not (is_bool_dtype(self.obj.index) or self.obj.index.dtype.name == "boolean"):
             raise KeyError(
                 f"{key}: boolean label can not be used without a boolean index"
             )
diff --git a/pandas/tests/base/test_value_counts.py b/pandas/tests/base/test_value_counts.py
index 47cd29c49000c..f88fe6fa34050 100644
--- a/pandas/tests/base/test_value_counts.py
+++ b/pandas/tests/base/test_value_counts.py
@@ -91,8 +91,9 @@ def test_value_counts_null(null_obj, index_or_series_obj):
 
     # can't use expected[null_obj] = 3 as
     # IntervalIndex doesn't allow assignment
-    new_entry = Series({np.nan: 3}, dtype=np.int64)
-    expected = expected.append(new_entry)
+    #new_entry = Series({np.nan: 3}, dtype=np.int64)
+    #expected = expected.append(new_entry)  # TODO: test that both of these work with IntegerNAIndex
+    expected[null_obj] = 3
 
     result = obj.value_counts(dropna=False)
     if obj.duplicated().any():
diff --git a/pandas/tests/strings/test_extract.py b/pandas/tests/strings/test_extract.py
index 9d5380955acae..0f4ffccd8ad7f 100644
--- a/pandas/tests/strings/test_extract.py
+++ b/pandas/tests/strings/test_extract.py
@@ -257,9 +257,7 @@ def test_extract_expand_True_single_capture_group(index_or_series, any_string_dt
     # single group renames series/index properly
     s_or_idx = index_or_series(["A1", "A2"], dtype=any_string_dtype)
     result = s_or_idx.str.extract(r"(?P<uno>A)\d", expand=True)
-    expected_dtype = "object" if index_or_series is Index else any_string_dtype
-    expected_dtype = any_string_dtype
-    expected = DataFrame({"uno": ["A", "A"]}, dtype=expected_dtype)
+    expected = DataFrame({"uno": ["A", "A"]}, dtype=any_string_dtype)
     tm.assert_frame_equal(result, expected)
 
 

From 1ed588a7acb7bbb9f23186a5f6fbce737c2eb941 Mon Sep 17 00:00:00 2001
From: Brock <jbrockmendel@gmail.com>
Date: Mon, 11 Oct 2021 16:13:19 -0700
Subject: [PATCH 04/57] fix test

---
 pandas/tests/arithmetic/test_numeric.py | 23 ++++++++++++++++-------
 1 file changed, 16 insertions(+), 7 deletions(-)

diff --git a/pandas/tests/arithmetic/test_numeric.py b/pandas/tests/arithmetic/test_numeric.py
index 8f97ed8774264..9cd56b66ca47e 100644
--- a/pandas/tests/arithmetic/test_numeric.py
+++ b/pandas/tests/arithmetic/test_numeric.py
@@ -1402,14 +1402,23 @@ def test_integer_array_add_list_like(
     left = container + box_1d_array(data)
     right = box_1d_array(data) + container
 
-    if Series == box_pandas_1d_array:
-        expected = Series(expected_data, dtype="Int64")
-    elif Series == box_1d_array:
-        expected = Series(expected_data, dtype="Int64")
-    elif Index in (box_pandas_1d_array, box_1d_array):
-        expected = Index(expected_data, dtype="Int64")
+    if Series in [box_1d_array, box_pandas_1d_array]:
+        cls = Series
+    elif Index in [box_1d_array, box_pandas_1d_array]:
+        cls = Index
     else:
-        expected = np.array(expected_data, dtype="object")
+        cls = np.array
+
+    if box_pandas_1d_array in [Index, Series]:
+        expected = cls(expected_data, dtype="Int64")
+
+    elif box_1d_array == Index:
+        # tm.to_array casts to object, Index constructor does inference
+        expected = cls(expected_data, dtype="int64")
+
+    else:
+        # tm.to_array casts to object, no inference
+        expected = cls(expected_data, dtype="object")
 
     tm.assert_equal(left, expected)
     tm.assert_equal(right, expected)

From 34d5dde0895e12ad723bd9b962cb9960941de4dc Mon Sep 17 00:00:00 2001
From: Brock <jbrockmendel@gmail.com>
Date: Mon, 11 Oct 2021 16:56:57 -0700
Subject: [PATCH 05/57] down to 38 tests failing

---
 pandas/_libs/index.pyx                        | 38 +++++++++++++++++--
 pandas/_libs/lib.pxd                          |  4 ++
 pandas/_libs/lib.pyx                          | 20 ++++++++++
 pandas/_testing/asserters.py                  | 10 ++++-
 pandas/conftest.py                            | 10 ++---
 pandas/core/arrays/floating.py                |  5 ++-
 pandas/core/arrays/masked.py                  |  3 ++
 pandas/core/indexes/base.py                   |  4 ++
 pandas/tests/arrays/floating/test_function.py | 20 ++++++----
 pandas/tests/indexes/common.py                |  9 ++++-
 10 files changed, 103 insertions(+), 20 deletions(-)

diff --git a/pandas/_libs/index.pyx b/pandas/_libs/index.pyx
index fc6de84631d72..4fa371f116fc1 100644
--- a/pandas/_libs/index.pyx
+++ b/pandas/_libs/index.pyx
@@ -32,6 +32,7 @@ from pandas._libs import (
     algos,
     hashtable as _hash,
 )
+from pandas._libs.lib cimport eq_NA_compat
 
 from pandas._libs.missing cimport (
     C_NA as NA,
@@ -63,7 +64,7 @@ cdef ndarray _get_bool_indexer(ndarray values, object val):
     if values.descr.type_num == cnp.NPY_OBJECT:
         # i.e. values.dtype == object
         if not checknull(val):
-            indexer = values == val
+            indexer = eq_NA_compat(values, val)
 
         else:
             # We need to check for _matching_ NA values
@@ -910,7 +911,7 @@ cdef class ExtensionEngine:
 
     cdef _get_bool_indexer(self, val):
         if checknull(val):
-            return self.values.isna()  # FIXME: need to check for *matching* NA
+            return self.values.isna().view("uint8")
 
         return self.values == val
 
@@ -946,6 +947,8 @@ cdef class ExtensionEngine:
                 # Because we are unique, loc should always be an integer
             except KeyError:
                 loc = -1
+            else:
+                assert util.is_integer_object(loc), (loc, val)
             res[i] = loc
 
         return res
@@ -1092,6 +1095,13 @@ cdef class NullableEngine:
         if is_definitely_invalid_key(val):
             raise TypeError(f"'{val}' is an invalid key")
 
+        if val is NA:
+            # TODO: return copy? readonly view?
+            # TODO: do this later on to keep same pattern as IndexEngine?
+            if not self.has_missing:
+                raise KeyError(val)
+            return _unpack_bool_indexer(self._mask, val)
+
         self._check_type(val)
 
         if self.over_size_threshold and self.is_monotonic_increasing:
@@ -1149,7 +1159,10 @@ cdef class NullableEngine:
 
     cdef _get_bool_indexer(self, val):
         if val is NA:
-            return self._mask
+            #if not self.has_missing:
+            #    raise KeyError(val)
+            # TODO: readonly? copy?
+            return self._mask.view("uint8")
 
         if util.is_nan(val):
             res = np.isnan(self._values)
@@ -1175,7 +1188,21 @@ cdef class NullableEngine:
         return self.sizeof()
 
     cdef _check_type(self, object val):
-        hash(val)
+        kind = self._values.dtype.kind
+        if kind in ["i", "u"]:
+            if not util.is_integer_object(val):
+                raise KeyError(val)
+            if kind == "u":
+                if val < 0:
+                    # cannot have negative values with unsigned int dtype
+                    raise KeyError(val)
+        elif kind == "b":
+            if not util.is_bool_object(val):
+                raise KeyError(val)
+        else:
+            if not util.is_integer_object(val) and not util.is_float_object(val):
+                # in particular catch bool and avoid casting True -> 1.0
+                raise KeyError(val)
 
     def get_indexer(self, values: "MaskedArray") -> np.ndarray:
         # Note: we only get here with self.is_unique
@@ -1191,6 +1218,9 @@ cdef class NullableEngine:
                 # Because we are unique, loc should always be an integer
             except KeyError:
                 loc = -1
+            else:
+                assert util.is_integer_object(loc), (loc, val)
+
             res[i] = loc
 
         return res
diff --git a/pandas/_libs/lib.pxd b/pandas/_libs/lib.pxd
index b3c72c30a74de..1306960b403e2 100644
--- a/pandas/_libs/lib.pxd
+++ b/pandas/_libs/lib.pxd
@@ -1 +1,5 @@
+from numpy cimport ndarray
+
 cdef bint c_is_list_like(object, bint) except -1
+
+cpdef ndarray eq_NA_compat(ndarray[object] arr, object key)
diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx
index e7f889ef39707..e1b821cd8c27f 100644
--- a/pandas/_libs/lib.pyx
+++ b/pandas/_libs/lib.pyx
@@ -3028,3 +3028,23 @@ def is_bool_list(obj: list) -> bool:
 
     # Note: we return True for empty list
     return True
+
+
+cpdef ndarray eq_NA_compat(ndarray[object] arr, object key):
+    cdef:
+        ndarray[uint8_t, cast=True] result = np.empty(len(arr), dtype=bool)
+        Py_ssize_t i
+        object item
+
+    if key is C_NA:
+        for i in range(len(arr)):
+            item = arr[i]
+            result[i] = item is C_NA
+    else:
+        for i in range(len(arr)):
+            item = arr[i]
+            if item is C_NA:
+                result[i] = False
+            else:
+                result[i] = item == key  # FIXME: compat for other NAs
+    return result
diff --git a/pandas/_testing/asserters.py b/pandas/_testing/asserters.py
index fc7e36dda4619..f95de4a206682 100644
--- a/pandas/_testing/asserters.py
+++ b/pandas/_testing/asserters.py
@@ -400,8 +400,16 @@ def _get_ilevel_values(index, level):
     # skip exact index checking when `check_categorical` is False
     if check_exact and check_categorical:
         if not left.equals(right):
+            mismatch = left._values != right._values
+
+            if not isinstance(mismatch, np.ndarray):
+                # i.e. it's a MaskedArray
+                mismatch = mismatch.to_numpy(dtype=int, na_value=0)
+                mismask = left._values._mask ^ right._values._mask
+                mismatch[mismask] = 1
+
             diff = (
-                np.sum((left._values != right._values).astype(int)) * 100.0 / len(left)
+                np.sum(mismatch.astype(int)) * 100.0 / len(left)
             )
             msg = f"{obj} values are different ({np.round(diff, 5)} %)"
             raise_assert_detail(obj, msg, left, right)
diff --git a/pandas/conftest.py b/pandas/conftest.py
index 3428c6a65da79..ab53628439011 100644
--- a/pandas/conftest.py
+++ b/pandas/conftest.py
@@ -501,11 +501,11 @@ def _create_mi_with_dt64tz_level():
     "nullable_int": Index(np.arange(100), dtype="Int64"),
     "nullable_float": Index(np.arange(100), dtype="Float32"),
     "nullable_bool": Index(np.arange(100).astype(bool), dtype="boolean"),
-    "nullable_int-na": Index(np.arange(100), dtype="Int64").insert(1, pd.NA),
-    "nullable_float-na": Index(np.arange(100), dtype="Float32").insert(1, pd.NA),
-    "nullable_bool-na": Index(np.arange(100).astype(bool), dtype="boolean").insert(
-        1, pd.NA
-    ),
+    #"nullable_int-na": Index(np.arange(100), dtype="Int64").insert(1, pd.NA),
+    #"nullable_float-na": Index(np.arange(100), dtype="Float32").insert(1, pd.NA),
+    #"nullable_bool-na": Index(np.arange(100).astype(bool), dtype="boolean").insert(
+    #    1, pd.NA
+    #),
 }
 
 
diff --git a/pandas/core/arrays/floating.py b/pandas/core/arrays/floating.py
index 066f6ebdfcaa6..a5cb8a419e4f5 100644
--- a/pandas/core/arrays/floating.py
+++ b/pandas/core/arrays/floating.py
@@ -108,6 +108,8 @@ def coerce_to_array(
     if dtype is None and hasattr(values, "dtype"):
         if is_float_dtype(values.dtype):
             dtype = values.dtype
+            if dtype == "float16":
+                raise TypeError("FloatingArray does not support float16 dtype")
 
     if dtype is not None:
         if isinstance(dtype, str) and dtype.startswith("Float"):
@@ -254,7 +256,8 @@ def dtype(self) -> FloatingDtype:
         return FLOAT_STR_TO_DTYPE[str(self._data.dtype)]
 
     def __init__(self, values: np.ndarray, mask: np.ndarray, copy: bool = False):
-        if not (isinstance(values, np.ndarray) and values.dtype.kind == "f"):
+        if not (isinstance(values, np.ndarray) and values.dtype.kind == "f" and values.dtype.itemsize > 2):
+            # We do not support float16
             raise TypeError(
                 "values should be floating numpy array. Use "
                 "the 'pd.array' function instead"
diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py
index 3737719460431..dbd022eac2f00 100644
--- a/pandas/core/arrays/masked.py
+++ b/pandas/core/arrays/masked.py
@@ -412,6 +412,9 @@ def reconstruct(x):
                 m = mask.copy()
                 return IntegerArray(x, m)
             elif is_float_dtype(x.dtype):
+                if x.dtype.itemsize <= 2:
+                    # we don't support float16
+                    x = x.astype(np.float32)
                 m = mask.copy()
                 return FloatingArray(x, m)
             else:
diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py
index 8586c9925fd7e..dd8c323cc4cfb 100644
--- a/pandas/core/indexes/base.py
+++ b/pandas/core/indexes/base.py
@@ -26,6 +26,7 @@
     algos as libalgos,
     index as libindex,
     lib,
+    missing as libmissing,
 )
 import pandas._libs.join as libjoin
 from pandas._libs.lib import (
@@ -821,6 +822,7 @@ def _engine(self) -> libindex.IndexEngine:
         ):
             return libindex.ExtensionEngine(self._values)
 
+        assert self.dtype != "boolean"
         # to avoid a reference cycle, bind `target_values` to a local variable, so
         # `self` is not passed into the lambda.
         target_values = self._get_engine_target()
@@ -3587,6 +3589,7 @@ def get_indexer(
 
             indexer = self._engine.get_indexer(target.codes)
             if self.hasnans and target.hasnans:
+                #loc = self.get_loc(libmissing.NA)
                 loc = self.get_loc(np.nan)
                 mask = target.isna()
                 indexer[mask] = loc
@@ -3605,6 +3608,7 @@ def get_indexer(
                 # Exclude MultiIndex because hasnans raises NotImplementedError
                 # we should only get here if we are unique, so loc is an integer
                 # GH#41934
+                #loc = self.get_loc(libmissing.NA)
                 loc = self.get_loc(np.nan)
                 mask = target.isna()
                 indexer[mask] = loc
diff --git a/pandas/tests/arrays/floating/test_function.py b/pandas/tests/arrays/floating/test_function.py
index ef95eac316397..f96cbb8366564 100644
--- a/pandas/tests/arrays/floating/test_function.py
+++ b/pandas/tests/arrays/floating/test_function.py
@@ -97,26 +97,30 @@ def test_stat_method(pandasmethname, kwargs):
 def test_value_counts_na():
     arr = pd.array([0.1, 0.2, 0.1, pd.NA], dtype="Float64")
     result = arr.value_counts(dropna=False)
-    expected = pd.Series([2, 1, 1], index=[0.1, 0.2, pd.NA], dtype="Int64")
+    idx = pd.Index([0.1, 0.2, pd.NA], dtype=arr.dtype)
+    assert idx.dtype == arr.dtype
+    expected = pd.Series([2, 1, 1], index=idx, dtype="Int64")
     tm.assert_series_equal(result, expected)
 
     result = arr.value_counts(dropna=True)
-    expected = pd.Series([2, 1], index=[0.1, 0.2], dtype="Int64")
+    expected = pd.Series([2, 1], index=idx[:-1], dtype="Int64")
     tm.assert_series_equal(result, expected)
 
 
 def test_value_counts_empty():
-    s = pd.Series([], dtype="Float64")
-    result = s.value_counts()
-    idx = pd.Index([], dtype="object")
+    ser = pd.Series([], dtype="Float64")
+    result = ser.value_counts()
+    idx = pd.Index([], dtype="Float64")
+    assert idx.dtype == "Float64"
     expected = pd.Series([], index=idx, dtype="Int64")
     tm.assert_series_equal(result, expected)
 
 
 def test_value_counts_with_normalize():
-    s = pd.Series([0.1, 0.2, 0.1, pd.NA], dtype="Float64")
-    result = s.value_counts(normalize=True)
-    expected = pd.Series([2, 1], index=[0.1, 0.2], dtype="Float64") / 3
+    ser = pd.Series([0.1, 0.2, 0.1, pd.NA], dtype="Float64")
+    result = ser.value_counts(normalize=True)
+    expected = pd.Series([2, 1], index=ser[:2], dtype="Float64") / 3
+    assert expected.index.dtype == ser.dtype
     tm.assert_series_equal(result, expected)
 
 
diff --git a/pandas/tests/indexes/common.py b/pandas/tests/indexes/common.py
index 38127a0e255bd..5932c0ab878b8 100644
--- a/pandas/tests/indexes/common.py
+++ b/pandas/tests/indexes/common.py
@@ -296,6 +296,12 @@ def test_ensure_copied_data(self, index):
         elif isinstance(index, IntervalIndex):
             # checked in test_interval.py
             pass
+        elif type(index) is Index and not isinstance(index.dtype, np.dtype):
+            result = index_type(index.values, copy=False, **init_kwargs)
+            # FIXME: this is specific to MaskedArray
+            tm.assert_numpy_array_equal(index._values._data, result._values._data, check_same="same")
+            tm.assert_numpy_array_equal(index._values._mask, result._values._mask, check_same="same")
+
         else:
             result = index_type(index.values, copy=False, **init_kwargs)
             tm.assert_numpy_array_equal(index.values, result.values, check_same="same")
@@ -315,7 +321,8 @@ def test_memory_usage(self, index):
 
         # RangeIndex, IntervalIndex
         # don't have engines
-        if not isinstance(index, (RangeIndex, IntervalIndex)):
+        # Index[EA] has engine but it does not have a Hashtable .mapping
+        if not isinstance(index, (RangeIndex, IntervalIndex)) and not (type(index) is Index and not isinstance(index.dtype, np.dtype)):
             assert result2 > result
 
         if index.inferred_type == "object":

From 544d9fea6ef8d30383e671880459681c4708fc57 Mon Sep 17 00:00:00 2001
From: Brock <jbrockmendel@gmail.com>
Date: Tue, 12 Oct 2021 19:14:45 -0700
Subject: [PATCH 06/57] down to 15 tests failing

---
 pandas/_libs/testing.pyx                     | 16 ++++++++++----
 pandas/core/arrays/floating.py               |  2 ++
 pandas/core/indexes/base.py                  |  2 +-
 pandas/tests/arrays/boolean/test_function.py | 13 +++++++-----
 pandas/tests/arrays/integer/test_function.py | 22 ++++++++++++--------
 pandas/tests/indexes/common.py               |  4 +++-
 pandas/tests/indexes/test_common.py          |  4 +++-
 7 files changed, 42 insertions(+), 21 deletions(-)

diff --git a/pandas/_libs/testing.pyx b/pandas/_libs/testing.pyx
index cfe9f40f12452..90c10be990ac2 100644
--- a/pandas/_libs/testing.pyx
+++ b/pandas/_libs/testing.pyx
@@ -13,6 +13,7 @@ from pandas._libs.util cimport (
     is_real_number_object,
 )
 
+from pandas._libs.missing cimport is_matching_na
 from pandas.core.dtypes.common import is_dtype_equal
 from pandas.core.dtypes.missing import (
     array_equivalent,
@@ -174,11 +175,18 @@ cpdef assert_almost_equal(a, b,
         # classes can't be the same, to raise error
         assert_class_equal(a, b, obj=obj)
 
-    if isna(a) and isna(b):
-        # TODO: Should require same-dtype NA?
-        # nan / None comparison
-        return True
+    if isna(a):
+        if isna(b):
+            # TODO: Should require same-dtype NA?
+            # nan / None comparison
+            return True
+
+        assert False, f"expected {a} but got {b}"
+
+    elif isna(b):
+        assert False, f"expected {a} but got {b}"
 
+    # TODO: test for tm.assert_whatever with pd.NA that would raise here
     if a == b:
         # object comparison
         return True
diff --git a/pandas/core/arrays/floating.py b/pandas/core/arrays/floating.py
index a5cb8a419e4f5..1e2864121c92e 100644
--- a/pandas/core/arrays/floating.py
+++ b/pandas/core/arrays/floating.py
@@ -425,6 +425,8 @@ def _maybe_mask_result(self, result, mask, other, op_name: str):
 
         return type(self)(result, mask, copy=False)
 
+    def isna(self):
+        return self._mask | np.isnan(self._data)
 
 _dtype_docstring = """
 An ExtensionDtype for {dtype} data.
diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py
index 832e633da07d9..57596024a8afd 100644
--- a/pandas/core/indexes/base.py
+++ b/pandas/core/indexes/base.py
@@ -358,7 +358,7 @@ def _outer_indexer(
 
     _typ: str = "index"
     _data: ExtensionArray | np.ndarray
-    _data_cls: type[np.ndarray] | type[ExtensionArray] = np.ndarray
+    _data_cls: type[np.ndarray] | type[ExtensionArray] = (np.ndarray, ExtensionArray)
     _id: object | None = None
     _name: Hashable = None
     # MultiIndex.levels previously allowed setting the index name. We
diff --git a/pandas/tests/arrays/boolean/test_function.py b/pandas/tests/arrays/boolean/test_function.py
index d90655b6e2820..6871fbb1e29d2 100644
--- a/pandas/tests/arrays/boolean/test_function.py
+++ b/pandas/tests/arrays/boolean/test_function.py
@@ -77,18 +77,21 @@ def test_ufunc_reduce_raises(values):
 def test_value_counts_na():
     arr = pd.array([True, False, pd.NA], dtype="boolean")
     result = arr.value_counts(dropna=False)
-    expected = pd.Series([1, 1, 1], index=[True, False, pd.NA], dtype="Int64")
+    expected = pd.Series([1, 1, 1], index=arr, dtype="Int64")
+    assert expected.index.dtype == arr.dtype
     tm.assert_series_equal(result, expected)
 
     result = arr.value_counts(dropna=True)
-    expected = pd.Series([1, 1], index=[True, False], dtype="Int64")
+    expected = pd.Series([1, 1], index=arr[:-1], dtype="Int64")
+    assert expected.index.dtype == arr.dtype
     tm.assert_series_equal(result, expected)
 
 
 def test_value_counts_with_normalize():
-    s = pd.Series([True, False, pd.NA], dtype="boolean")
-    result = s.value_counts(normalize=True)
-    expected = pd.Series([1, 1], index=[True, False], dtype="Float64") / 2
+    ser = pd.Series([True, False, pd.NA], dtype="boolean")
+    result = ser.value_counts(normalize=True)
+    expected = pd.Series([1, 1], index=ser[:-1], dtype="Float64") / 2
+    assert expected.index.dtype == "boolean"
     tm.assert_series_equal(result, expected)
 
 
diff --git a/pandas/tests/arrays/integer/test_function.py b/pandas/tests/arrays/integer/test_function.py
index 6f53b44776900..306bac96da3d0 100644
--- a/pandas/tests/arrays/integer/test_function.py
+++ b/pandas/tests/arrays/integer/test_function.py
@@ -108,29 +108,33 @@ def test_stat_method(pandasmethname, kwargs):
 def test_value_counts_na():
     arr = pd.array([1, 2, 1, pd.NA], dtype="Int64")
     result = arr.value_counts(dropna=False)
-    expected = pd.Series([2, 1, 1], index=[1, 2, pd.NA], dtype="Int64")
+    ex_index = pd.Index([1, 2, pd.NA], dtype="Int64")
+    assert ex_index.dtype == "Int64"
+    expected = pd.Series([2, 1, 1], index=ex_index, dtype="Int64")
     tm.assert_series_equal(result, expected)
 
     result = arr.value_counts(dropna=True)
-    expected = pd.Series([2, 1], index=[1, 2], dtype="Int64")
+    expected = pd.Series([2, 1], index=arr[:2], dtype="Int64")
+    assert expected.index.dtype == arr.dtype
     tm.assert_series_equal(result, expected)
 
 
 def test_value_counts_empty():
     # https://github.com/pandas-dev/pandas/issues/33317
-    s = pd.Series([], dtype="Int64")
-    result = s.value_counts()
-    # TODO: The dtype of the index seems wrong (it's int64 for non-empty)
-    idx = pd.Index([], dtype="object")
+    ser = pd.Series([], dtype="Int64")
+    result = ser.value_counts()
+    idx = pd.Index([], dtype=ser.dtype)
+    assert idx.dtype == ser.dtype
     expected = pd.Series([], index=idx, dtype="Int64")
     tm.assert_series_equal(result, expected)
 
 
 def test_value_counts_with_normalize():
     # GH 33172
-    s = pd.Series([1, 2, 1, pd.NA], dtype="Int64")
-    result = s.value_counts(normalize=True)
-    expected = pd.Series([2, 1], index=[1, 2], dtype="Float64") / 3
+    ser = pd.Series([1, 2, 1, pd.NA], dtype="Int64")
+    result = ser.value_counts(normalize=True)
+    expected = pd.Series([2, 1], index=ser[:2], dtype="Float64") / 3
+    assert expected.index.dtype == ser.dtype
     tm.assert_series_equal(result, expected)
 
 
diff --git a/pandas/tests/indexes/common.py b/pandas/tests/indexes/common.py
index 6de8bda1fde61..cd3bf547491f0 100644
--- a/pandas/tests/indexes/common.py
+++ b/pandas/tests/indexes/common.py
@@ -443,7 +443,9 @@ def test_equals(self, index):
 
         assert index.equals(index)
         assert index.equals(index.copy())
-        assert index.equals(index.astype(object))
+        if not (type(index) is Index and not isinstance(index.dtype, np.dtype)):
+            # doesn't hold for e.g. IntegerDtype
+            assert index.equals(index.astype(object))
 
         assert not index.equals(list(index))
         assert not index.equals(np.array(index))
diff --git a/pandas/tests/indexes/test_common.py b/pandas/tests/indexes/test_common.py
index 604b68cfcc791..4e899112dd048 100644
--- a/pandas/tests/indexes/test_common.py
+++ b/pandas/tests/indexes/test_common.py
@@ -422,7 +422,9 @@ def test_sort_values_with_missing(index_with_missing, na_position):
         sorted_values = np.concatenate([[None] * missing_count, sorted_values])
     else:
         sorted_values = np.concatenate([sorted_values, [None] * missing_count])
-    expected = type(index_with_missing)(sorted_values)
+
+    # Explicitly pass dtype; needed for an Index backed by an EA, e.g. IntegerArray
+    expected = type(index_with_missing)(sorted_values, dtype=index_with_missing.dtype)
 
     result = index_with_missing.sort_values(na_position=na_position)
     tm.assert_index_equal(result, expected)

From 900978c4e50a69b457205240fcb730a6222d3778 Mon Sep 17 00:00:00 2001
From: Brock <jbrockmendel@gmail.com>
Date: Fri, 15 Oct 2021 21:27:07 -0700
Subject: [PATCH 07/57] fix value_counts

---
 pandas/conftest.py           |  1 +
 pandas/core/algorithms.py    |  7 +++++--
 pandas/core/arrays/masked.py |  2 +-
 pandas/core/dtypes/concat.py |  1 +
 pandas/core/indexes/base.py  | 15 ++++++++++++++-
 5 files changed, 22 insertions(+), 4 deletions(-)

diff --git a/pandas/conftest.py b/pandas/conftest.py
index ab53628439011..9d26c2749b77e 100644
--- a/pandas/conftest.py
+++ b/pandas/conftest.py
@@ -499,6 +499,7 @@ def _create_mi_with_dt64tz_level():
     "multi": _create_multiindex(),
     "repeats": Index([0, 0, 1, 1, 2, 2]),
     "nullable_int": Index(np.arange(100), dtype="Int64"),
+    "nullable_uint": Index(np.arange(100), dtype="UInt16"),
     "nullable_float": Index(np.arange(100), dtype="Float32"),
     "nullable_bool": Index(np.arange(100).astype(bool), dtype="boolean"),
     #"nullable_int-na": Index(np.arange(100), dtype="Int64").insert(1, pd.NA),
diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
index 10a5932731e3b..9420745f2a284 100644
--- a/pandas/core/algorithms.py
+++ b/pandas/core/algorithms.py
@@ -1830,12 +1830,15 @@ def union_with_duplicates(lvals: ArrayLike, rvals: ArrayLike) -> ArrayLike:
     np.ndarray or ExtensionArray
         Containing the unsorted union of both arrays.
     """
+    from pandas.core.dtypes.concat import concat_compat
+
     indexer = []
     l_count = value_counts(lvals, dropna=False)
     r_count = value_counts(rvals, dropna=False)
     l_count, r_count = l_count.align(r_count, fill_value=0)
-    unique_array = unique(np.append(lvals, rvals))
-    if not isinstance(lvals, np.ndarray):
+    unique_array = unique(concat_compat([lvals, rvals]))
+    unique_array = ensure_wrapped_if_datetimelike(unique_array)
+    if False:#not isinstance(lvals, np.ndarray):
         # i.e. ExtensionArray
         # Note: we only get here with lvals.dtype == rvals.dtype
         # TODO: are there any cases where union won't be type/dtype preserving?
diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py
index dbd022eac2f00..6cef5c5259dac 100644
--- a/pandas/core/arrays/masked.py
+++ b/pandas/core/arrays/masked.py
@@ -577,7 +577,7 @@ def value_counts(self, dropna: bool = True) -> Series:
             counts[:-1] = value_counts
             counts[-1] = self._mask.sum()
 
-            index = index.insert(-1, self.dtype.na_value)
+            index = index.insert(len(index), self.dtype.na_value)
             # index = Index(
             #    np.concatenate([index, np.array([self.dtype.na_value], dtype=object)]),
             #    dtype=object,
diff --git a/pandas/core/dtypes/concat.py b/pandas/core/dtypes/concat.py
index c7fce9fff3631..23b76983c6789 100644
--- a/pandas/core/dtypes/concat.py
+++ b/pandas/core/dtypes/concat.py
@@ -22,6 +22,7 @@
     ABCSeries,
 )
 
+# TODO: avoid these imports so we can import from this file in core.algorithms
 from pandas.core.arrays import ExtensionArray
 from pandas.core.arrays.sparse import SparseArray
 from pandas.core.construction import (
diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py
index 57596024a8afd..45f399f4631f4 100644
--- a/pandas/core/indexes/base.py
+++ b/pandas/core/indexes/base.py
@@ -5889,7 +5889,20 @@ def map(self, mapper, na_action=None):
                 new_values, dtype=dtype, copy=False, name=self.name
             )
 
-        return Index._with_infer(new_values, dtype=dtype, copy=False, name=self.name)
+        result = Index._with_infer(new_values, dtype=dtype, copy=False, name=self.name)
+
+        if type(self) is Index and not isinstance(self.dtype, np.dtype):
+            # TODO: what about "integer-na"
+            if self.dtype.kind in ["i", "u"] and result.inferred_type == "integer":
+                # TODO: worry about itemsize/overflows?
+                result = result.astype(self.dtype, copy=False)
+            elif self.dtype.kind == "f" and result.inferred_type == "floating":
+                # TODO: worry about itemsize/overflows?
+                result = result.astype(self.dtype, copy=False)
+            elif self.dtype == "boolean" and result.inferred_type == "boolean":
+                result = result.astype(self.dtype, copy=False)
+
+        return result
 
     # TODO: De-duplicate with map, xref GH#32349
     @final

From c0ae18c808ee8c75b25da00da036089e82a55a51 Mon Sep 17 00:00:00 2001
From: Brock <jbrockmendel@gmail.com>
Date: Mon, 18 Oct 2021 08:34:08 -0700
Subject: [PATCH 08/57] fix map test

---
 pandas/tests/indexes/test_base.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py
index cbcb00a4230cc..9abf1abbc4365 100644
--- a/pandas/tests/indexes/test_base.py
+++ b/pandas/tests/indexes/test_base.py
@@ -723,6 +723,9 @@ def test_map_dictlike(self, index, mapper):
             else:
                 exp_dtype = np.int64
             expected = index._constructor(np.arange(len(index), 0, -1), dtype=exp_dtype)
+        elif type(index) is Index and index.dtype != object:
+            # i.e. EA-backed, for now just Nullable
+            expected = Index(np.arange(len(index), 0, -1), dtype=index.dtype)
         else:
             expected = Index(np.arange(len(index), 0, -1))
 

From a9ef37ec1c43cb95238d31dc26a5d01a3b136ee3 Mon Sep 17 00:00:00 2001
From: Brock <jbrockmendel@gmail.com>
Date: Thu, 21 Oct 2021 12:23:53 -0700
Subject: [PATCH 09/57] fix some tests

---
 pandas/_libs/index.pyx                     | 12 +++++++----
 pandas/_libs/testing.pyx                   | 23 +++++++++++-----------
 pandas/conftest.py                         |  9 ++++-----
 pandas/core/algorithms.py                  |  6 +-----
 pandas/core/arrays/floating.py             |  7 ++++++-
 pandas/core/arrays/masked.py               |  1 +
 pandas/core/arrays/string_.py              |  4 +++-
 pandas/core/arrays/string_arrow.py         |  3 +--
 pandas/core/indexes/base.py                | 21 ++++++++++++++------
 pandas/tests/arrays/string_/test_string.py | 10 +++++-----
 pandas/tests/indexes/test_any_index.py     | 13 +++++++++---
 11 files changed, 66 insertions(+), 43 deletions(-)

diff --git a/pandas/_libs/index.pyx b/pandas/_libs/index.pyx
index 9914800f5a87a..9bd93c50ef40c 100644
--- a/pandas/_libs/index.pyx
+++ b/pandas/_libs/index.pyx
@@ -32,8 +32,8 @@ from pandas._libs import (
     algos,
     hashtable as _hash,
 )
-from pandas._libs.lib cimport eq_NA_compat
 
+from pandas._libs.lib cimport eq_NA_compat
 from pandas._libs.missing cimport (
     C_NA as NA,
     checknull,
@@ -934,18 +934,22 @@ cdef class ExtensionEngine:
 
         return self._maybe_get_bool_indexer(val)
 
-    cdef _get_bool_indexer(self, val):
+    cdef ndarray _get_bool_indexer(self, val):
         if checknull(val):
             return self.values.isna().view("uint8")
 
-        return self.values == val
+        try:
+            return self.values == val
+        except TypeError:
+            # e.g. if __eq__ returns a BooleanArray instead of ndarray[bool]
+            return (self.values == val).to_numpy(dtype=bool, na_value=False)
 
     cdef _maybe_get_bool_indexer(self, object val):
         # Returns ndarray[bool] or int
         cdef:
             ndarray[uint8_t, ndim=1, cast=True] indexer
 
-        indexer = _get_bool_indexer(self.values, val)
+        indexer = self._get_bool_indexer(val)
         return _unpack_bool_indexer(indexer, val)
 
     def sizeof(self, deep: bool = False) -> int:
diff --git a/pandas/_libs/testing.pyx b/pandas/_libs/testing.pyx
index 90c10be990ac2..d2566123815f1 100644
--- a/pandas/_libs/testing.pyx
+++ b/pandas/_libs/testing.pyx
@@ -7,13 +7,13 @@ from numpy cimport import_array
 
 import_array()
 
+from pandas._libs.missing cimport is_matching_na
 from pandas._libs.util cimport (
     is_array,
     is_complex_object,
     is_real_number_object,
 )
 
-from pandas._libs.missing cimport is_matching_na
 from pandas.core.dtypes.common import is_dtype_equal
 from pandas.core.dtypes.missing import (
     array_equivalent,
@@ -175,16 +175,17 @@ cpdef assert_almost_equal(a, b,
         # classes can't be the same, to raise error
         assert_class_equal(a, b, obj=obj)
 
-    if isna(a):
-        if isna(b):
-            # TODO: Should require same-dtype NA?
-            # nan / None comparison
-            return True
-
-        assert False, f"expected {a} but got {b}"
-
-    elif isna(b):
-        assert False, f"expected {a} but got {b}"
+    if isna(a) and isna(b):
+        return True
+        #if isna(b):
+        #    # TODO: Should require same-dtype NA?
+        #    # nan / None comparison
+        #    return True
+        #
+        #assert False, f"expected {a} but got {b}"
+
+    #elif isna(b):
+    #    assert False, f"expected {a} but got {b}"
 
     # TODO: test for tm.assert_whatever with pd.NA that would raise here
     if a == b:
diff --git a/pandas/conftest.py b/pandas/conftest.py
index c66f386015685..ef48dc0055615 100644
--- a/pandas/conftest.py
+++ b/pandas/conftest.py
@@ -511,15 +511,14 @@ def _create_mi_with_dt64tz_level():
     "mi-with-dt64tz-level": _create_mi_with_dt64tz_level(),
     "multi": _create_multiindex(),
     "repeats": Index([0, 0, 1, 1, 2, 2]),
+    # TODO: make sure we have cases that also have NA values
+    #  (not allowed in this fixture)
     "nullable_int": Index(np.arange(100), dtype="Int64"),
     "nullable_uint": Index(np.arange(100), dtype="UInt16"),
     "nullable_float": Index(np.arange(100), dtype="Float32"),
     "nullable_bool": Index(np.arange(100).astype(bool), dtype="boolean"),
-    #"nullable_int-na": Index(np.arange(100), dtype="Int64").insert(1, pd.NA),
-    #"nullable_float-na": Index(np.arange(100), dtype="Float32").insert(1, pd.NA),
-    #"nullable_bool-na": Index(np.arange(100).astype(bool), dtype="boolean").insert(
-    #    1, pd.NA
-    #),
+    "string-python": Index(pd.array(tm.makeStringIndex(100), dtype="string[python]")),
+    "string-pyarrow": Index(pd.array(tm.makeStringIndex(100), dtype="string[pyarrow]")),
 }
 
 
diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
index 45ea5fc2716e4..3740667a2efff 100644
--- a/pandas/core/algorithms.py
+++ b/pandas/core/algorithms.py
@@ -1843,11 +1843,7 @@ def union_with_duplicates(lvals: ArrayLike, rvals: ArrayLike) -> ArrayLike:
     l_count, r_count = l_count.align(r_count, fill_value=0)
     unique_array = unique(concat_compat([lvals, rvals]))
     unique_array = ensure_wrapped_if_datetimelike(unique_array)
-    if False:#not isinstance(lvals, np.ndarray):
-        # i.e. ExtensionArray
-        # Note: we only get here with lvals.dtype == rvals.dtype
-        # TODO: are there any cases where union won't be type/dtype preserving?
-        unique_array = type(lvals)._from_sequence(unique_array, dtype=lvals.dtype)
+
     for i, value in enumerate(unique_array):
         indexer += [i] * int(max(l_count[value], r_count[value]))
     return unique_array.take(indexer)
diff --git a/pandas/core/arrays/floating.py b/pandas/core/arrays/floating.py
index 9e03489d4343e..1dee05954d8d5 100644
--- a/pandas/core/arrays/floating.py
+++ b/pandas/core/arrays/floating.py
@@ -256,7 +256,11 @@ def dtype(self) -> FloatingDtype:
         return FLOAT_STR_TO_DTYPE[str(self._data.dtype)]
 
     def __init__(self, values: np.ndarray, mask: np.ndarray, copy: bool = False):
-        if not (isinstance(values, np.ndarray) and values.dtype.kind == "f" and values.dtype.itemsize > 2):
+        if not (
+            isinstance(values, np.ndarray)
+            and values.dtype.kind == "f"
+            and values.dtype.itemsize > 2
+        ):
             # We do not support float16
             raise TypeError(
                 "values should be floating numpy array. Use "
@@ -428,6 +432,7 @@ def _maybe_mask_result(self, result, mask, other, op_name: str):
     def isna(self):
         return self._mask | np.isnan(self._data)
 
+
 _dtype_docstring = """
 An ExtensionDtype for {dtype} data.
 
diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py
index 877a5f24367e4..51429a4fd0418 100644
--- a/pandas/core/arrays/masked.py
+++ b/pandas/core/arrays/masked.py
@@ -453,6 +453,7 @@ def reconstruct(x):
                 return IntegerArray(x, m)
             elif is_float_dtype(x.dtype):
                 if x.dtype.itemsize <= 2:
+                    # reached in e.g. np.sqrt on BooleanArray
                     # we don't support float16
                     x = x.astype(np.float32)
                 m = mask.copy()
diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py
index d93fa4bbdd7fc..00281bc8a7101 100644
--- a/pandas/core/arrays/string_.py
+++ b/pandas/core/arrays/string_.py
@@ -474,7 +474,9 @@ def max(self, axis=None, skipna: bool = True, **kwargs) -> Scalar:
     def value_counts(self, dropna: bool = True):
         from pandas import value_counts
 
-        return value_counts(self._ndarray, dropna=dropna).astype("Int64")
+        result = value_counts(self._ndarray, dropna=dropna).astype("Int64")
+        result.index = result.index.astype(self.dtype)
+        return result
 
     def memory_usage(self, deep: bool = False) -> int:
         result = self._ndarray.nbytes
diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py
index c7d08f7873c09..ccd3e063c48a6 100644
--- a/pandas/core/arrays/string_arrow.py
+++ b/pandas/core/arrays/string_arrow.py
@@ -668,8 +668,7 @@ def value_counts(self, dropna: bool = True) -> Series:
         # No missing values so we can adhere to the interface and return a numpy array.
         counts = np.array(counts)
 
-        # Index cannot hold ExtensionArrays yet
-        index = Index(type(self)(values)).astype(object)
+        index = Index(type(self)(values))
 
         return Series(counts, index=index).astype("Int64")
 
diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py
index b6f0ef937f8ba..cb426a2e408c2 100644
--- a/pandas/core/indexes/base.py
+++ b/pandas/core/indexes/base.py
@@ -69,6 +69,7 @@
     can_hold_element,
     find_common_type,
     infer_dtype_from,
+    maybe_cast_pointwise_result,
     validate_numeric_casting,
 )
 from pandas.core.dtypes.common import (
@@ -1082,6 +1083,7 @@ def take(
                 values, indices, allow_fill=allow_fill, fill_value=self._na_value
             )
         else:
+            # algos.take passes 'axis' keyword which not all EAs accept
             taken = values.take(
                 indices, allow_fill=allow_fill, fill_value=self._na_value
             )
@@ -2576,8 +2578,12 @@ def __reduce__(self):
     # --------------------------------------------------------------------
     # Null Handling Methods
 
-    _na_value: float | NaTType = np.nan
-    """The expected NA value to use with this index."""
+    @cache_readonly
+    def _na_value(self):
+        """The expected NA value to use with this index."""
+        if isinstance(self.dtype, np.dtype):
+            return np.nan
+        return self.dtype.na_value
 
     @cache_readonly
     def _isnan(self) -> npt.NDArray[np.bool_]:
@@ -3634,7 +3640,7 @@ def get_indexer(
 
             indexer = self._engine.get_indexer(target.codes)
             if self.hasnans and target.hasnans:
-                #loc = self.get_loc(libmissing.NA)
+                # loc = self.get_loc(libmissing.NA)
                 loc = self.get_loc(np.nan)
                 mask = target.isna()
                 indexer[mask] = loc
@@ -3653,7 +3659,7 @@ def get_indexer(
                 # Exclude MultiIndex because hasnans raises NotImplementedError
                 # we should only get here if we are unique, so loc is an integer
                 # GH#41934
-                #loc = self.get_loc(libmissing.NA)
+                # loc = self.get_loc(libmissing.NA)
                 loc = self.get_loc(np.nan)
                 mask = target.isna()
                 indexer[mask] = loc
@@ -5981,9 +5987,12 @@ def map(self, mapper, na_action=None):
                 new_values, dtype=dtype, copy=False, name=self.name
             )
 
-        result = Index._with_infer(new_values, dtype=dtype, copy=False, name=self.name)
+        res_values = maybe_cast_pointwise_result(
+            new_values, self.dtype, same_dtype=True
+        )
+        result = Index._with_infer(res_values, dtype=dtype, copy=False, name=self.name)
 
-        if type(self) is Index and not isinstance(self.dtype, np.dtype):
+        if False:  # type(self) is Index and not isinstance(self.dtype, np.dtype):
             # TODO: what about "integer-na"
             if self.dtype.kind in ["i", "u"] and result.inferred_type == "integer":
                 # TODO: worry about itemsize/overflows?
diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py
index fa564ac76f8bb..e540ed248a57e 100644
--- a/pandas/tests/arrays/string_/test_string.py
+++ b/pandas/tests/arrays/string_/test_string.py
@@ -466,18 +466,18 @@ def test_arrow_load_from_zero_chunks(dtype, string_storage2):
 def test_value_counts_na(dtype):
     arr = pd.array(["a", "b", "a", pd.NA], dtype=dtype)
     result = arr.value_counts(dropna=False)
-    expected = pd.Series([2, 1, 1], index=["a", "b", pd.NA], dtype="Int64")
+    expected = pd.Series([2, 1, 1], index=arr[[0, 1, 3]], dtype="Int64")
     tm.assert_series_equal(result, expected)
 
     result = arr.value_counts(dropna=True)
-    expected = pd.Series([2, 1], index=["a", "b"], dtype="Int64")
+    expected = pd.Series([2, 1], index=arr[:2], dtype="Int64")
     tm.assert_series_equal(result, expected)
 
 
 def test_value_counts_with_normalize(dtype):
-    s = pd.Series(["a", "b", "a", pd.NA], dtype=dtype)
-    result = s.value_counts(normalize=True)
-    expected = pd.Series([2, 1], index=["a", "b"], dtype="Float64") / 3
+    ser = pd.Series(["a", "b", "a", pd.NA], dtype=dtype)
+    result = ser.value_counts(normalize=True)
+    expected = pd.Series([2, 1], index=ser[:2], dtype="Float64") / 3
     tm.assert_series_equal(result, expected)
 
 
diff --git a/pandas/tests/indexes/test_any_index.py b/pandas/tests/indexes/test_any_index.py
index f68bde2188e67..3f85cd59bff76 100644
--- a/pandas/tests/indexes/test_any_index.py
+++ b/pandas/tests/indexes/test_any_index.py
@@ -139,9 +139,16 @@ def test_slice_keeps_name(self, index):
     # FutureWarning from non-tuple sequence of nd indexing
     @pytest.mark.filterwarnings("ignore::FutureWarning")
     def test_getitem_error(self, index, item):
-        msg = r"index 101 is out of bounds for axis 0 with size [\d]+|" + re.escape(
-            "only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) "
-            "and integer or boolean arrays are valid indices"
+        msg = "|".join(
+            [
+                r"index 101 is out of bounds for axis 0 with size [\d]+",
+                re.escape(
+                    "only integers, slices (`:`), ellipsis (`...`), "
+                    "numpy.newaxis (`None`) and integer or boolean arrays "
+                    "are valid indices"
+                ),
+                "index out of bounds",  # string[pyarrow]
+            ]
         )
         with pytest.raises(IndexError, match=msg):
             index[item]

From 41acf3f9390d55413416b60218a4c62a509eb511 Mon Sep 17 00:00:00 2001
From: Brock <jbrockmendel@gmail.com>
Date: Thu, 21 Oct 2021 19:00:32 -0700
Subject: [PATCH 10/57] ENH: ExtensionArray.insert

---
 doc/source/reference/extensions.rst    |  1 +
 pandas/core/arrays/_mixins.py          |  3 ++
 pandas/core/arrays/base.py             | 30 ++++++++++++++++++
 pandas/tests/extension/base/methods.py | 42 ++++++++++++++++++++++++++
 pandas/tests/extension/conftest.py     | 12 ++++++++
 pandas/tests/extension/test_numpy.py   | 12 ++++++++
 pandas/tests/extension/test_string.py  |  7 +++++
 pandas/util/_validators.py             | 23 +++++++++++++-
 8 files changed, 129 insertions(+), 1 deletion(-)

diff --git a/doc/source/reference/extensions.rst b/doc/source/reference/extensions.rst
index e2e8c94ef8fc6..ce8d8d5c2ca10 100644
--- a/doc/source/reference/extensions.rst
+++ b/doc/source/reference/extensions.rst
@@ -48,6 +48,7 @@ objects.
       api.extensions.ExtensionArray.equals
       api.extensions.ExtensionArray.factorize
       api.extensions.ExtensionArray.fillna
+      api.extensions.ExtensionArray.insert
       api.extensions.ExtensionArray.isin
       api.extensions.ExtensionArray.isna
       api.extensions.ExtensionArray.ravel
diff --git a/pandas/core/arrays/_mixins.py b/pandas/core/arrays/_mixins.py
index 3769c686da029..cf9820c3aa8f8 100644
--- a/pandas/core/arrays/_mixins.py
+++ b/pandas/core/arrays/_mixins.py
@@ -31,6 +31,7 @@
 from pandas.util._validators import (
     validate_bool_kwarg,
     validate_fillna_kwargs,
+    validate_insert_loc,
 )
 
 from pandas.core.dtypes.common import is_dtype_equal
@@ -359,6 +360,8 @@ def insert(
         -------
         type(self)
         """
+        loc = validate_insert_loc(loc, len(self))
+
         code = self._validate_scalar(item)
 
         new_vals = np.concatenate(
diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py
index bf54f7166e14d..9b25a1b5abccd 100644
--- a/pandas/core/arrays/base.py
+++ b/pandas/core/arrays/base.py
@@ -47,6 +47,7 @@
 from pandas.util._validators import (
     validate_bool_kwarg,
     validate_fillna_kwargs,
+    validate_insert_loc,
 )
 
 from pandas.core.dtypes.cast import maybe_cast_to_extension_array
@@ -123,6 +124,7 @@ class ExtensionArray:
     factorize
     fillna
     equals
+    insert
     isin
     isna
     ravel
@@ -1388,6 +1390,34 @@ def delete(self: ExtensionArrayT, loc: PositionalIndexer) -> ExtensionArrayT:
         indexer = np.delete(np.arange(len(self)), loc)
         return self.take(indexer)
 
+    def insert(self: ExtensionArrayT, loc: int, item) -> ExtensionArrayT:
+        """
+        Insert an item at the given position.
+
+        Parameters
+        ----------
+        loc : int
+        item : scalar-like
+
+        Returns
+        -------
+        same type as self
+
+        Notes
+        -----
+        This method should be both type and dtype-preserving.  If the item
+        cannot be held in an array of this type/dtype, either ValueError or
+        TypeError should be raised.
+
+        The default implementation relies on _from_sequence to raise on invalid
+        items.
+        """
+        loc = validate_insert_loc(loc, len(self))
+
+        item_arr = type(self)._from_sequence([item], dtype=self.dtype)
+
+        return type(self)._concat_same_type([self[:loc], item_arr, self[loc:]])
+
     @classmethod
     def _empty(cls, shape: Shape, dtype: ExtensionDtype):
         """
diff --git a/pandas/tests/extension/base/methods.py b/pandas/tests/extension/base/methods.py
index d390d4b5d8143..c96e2fb49e397 100644
--- a/pandas/tests/extension/base/methods.py
+++ b/pandas/tests/extension/base/methods.py
@@ -511,6 +511,48 @@ def test_delete(self, data):
         expected = data._concat_same_type([data[[0]], data[[2]], data[4:]])
         self.assert_extension_array_equal(result, expected)
 
+    def test_insert(self, data):
+        # insert at the beginning
+        result = data[1:].insert(0, data[0])
+        self.assert_extension_array_equal(result, data)
+
+        result = data[1:].insert(-len(data[1:]), data[0])
+        self.assert_extension_array_equal(result, data)
+
+        # insert at the middle
+        result = data[:-1].insert(4, data[-1])
+
+        taker = np.arange(len(data))
+        taker[5:] = taker[4:-1]
+        taker[4] = len(data) - 1
+        expected = data.take(taker)
+        self.assert_extension_array_equal(result, expected)
+
+    def test_insert_invalid(self, data, invalid_scalar):
+        item = invalid_scalar
+
+        with pytest.raises((TypeError, ValueError)):
+            data.insert(0, item)
+
+        with pytest.raises((TypeError, ValueError)):
+            data.insert(4, item)
+
+        with pytest.raises((TypeError, ValueError)):
+            data.insert(len(data) - 1, item)
+
+    def test_insert_invalid_loc(self, data):
+        ub = len(data)
+
+        with pytest.raises(IndexError):
+            data.insert(ub + 1, data[0])
+
+        with pytest.raises(IndexError):
+            data.insert(-ub - 1, data[0])
+
+        with pytest.raises(TypeError):
+            # we expect TypeError here instead of IndexError to match np.insert
+            data.insert(1.5, data[0])
+
     @pytest.mark.parametrize("box", [pd.array, pd.Series, pd.DataFrame])
     def test_equals(self, data, na_value, as_series, box):
         data2 = type(data)._from_sequence([data[0]] * len(data), dtype=data.dtype)
diff --git a/pandas/tests/extension/conftest.py b/pandas/tests/extension/conftest.py
index 1942d737780da..3827ba234cfd8 100644
--- a/pandas/tests/extension/conftest.py
+++ b/pandas/tests/extension/conftest.py
@@ -181,3 +181,15 @@ def as_array(request):
     Boolean fixture to support ExtensionDtype _from_sequence method testing.
     """
     return request.param
+
+
+@pytest.fixture
+def invalid_scalar(data):
+    """
+    A scalar that *cannot* be held by this ExtensionArray.
+
+    The default should work for most subclasses, but is not guaranteed.
+
+    If the array can hold any item (i.e. object dtype), then use pytest.skip.
+    """
+    return object.__new__(object)
diff --git a/pandas/tests/extension/test_numpy.py b/pandas/tests/extension/test_numpy.py
index 7be776819e399..0e3e26e7e9500 100644
--- a/pandas/tests/extension/test_numpy.py
+++ b/pandas/tests/extension/test_numpy.py
@@ -265,6 +265,18 @@ def test_searchsorted(self, data_for_sorting, as_series):
     def test_diff(self, data, periods):
         return super().test_diff(data, periods)
 
+    def test_insert(self, data, request):
+        if data.dtype.numpy_dtype == object:
+            mark = pytest.mark.xfail(reason="Dimension mismatch in np.concatenate")
+            request.node.add_marker(mark)
+
+        super().test_insert(data)
+
+    @skip_nested
+    def test_insert_invalid(self, data, invalid_scalar):
+        # PandasArray[object] can hold anything, so skip
+        super().test_insert_invalid(data, invalid_scalar)
+
 
 class TestArithmetics(BaseNumPyTests, base.BaseArithmeticOpsTests):
     divmod_exc = None
diff --git a/pandas/tests/extension/test_string.py b/pandas/tests/extension/test_string.py
index af86c359c4c00..06b07968f949e 100644
--- a/pandas/tests/extension/test_string.py
+++ b/pandas/tests/extension/test_string.py
@@ -160,6 +160,13 @@ def test_value_counts(self, all_data, dropna):
     def test_value_counts_with_normalize(self, data):
         pass
 
+    def test_insert_invalid(self, data, invalid_scalar, request):
+        if data.dtype.storage == "pyarrow":
+            mark = pytest.mark.xfail(reason="casts invalid_scalar to string")
+            request.node.add_marker(mark)
+
+        super().test_insert_invalid(data, invalid_scalar)
+
 
 class TestCasting(base.BaseCastingTests):
     pass
diff --git a/pandas/util/_validators.py b/pandas/util/_validators.py
index 7e03e3ceea11d..f8bd1ec7bc96a 100644
--- a/pandas/util/_validators.py
+++ b/pandas/util/_validators.py
@@ -12,7 +12,10 @@
 
 import numpy as np
 
-from pandas.core.dtypes.common import is_bool
+from pandas.core.dtypes.common import (
+    is_bool,
+    is_integer,
+)
 
 
 def _check_arg_length(fname, args, max_fname_arg_count, compat_args):
@@ -494,3 +497,21 @@ def validate_inclusive(inclusive: str | None) -> tuple[bool, bool]:
         )
 
     return left_right_inclusive
+
+
+def validate_insert_loc(loc: int, length: int) -> int:
+    """
+    Check that we have an integer between -length and length, inclusive.
+
+    Standardize negative loc to within [0, length].
+
+    The exceptions we raise on failure match np.insert.
+    """
+    if not is_integer(loc):
+        raise TypeError(f"loc must be an integer between -{length} and {length}")
+
+    if loc < 0:
+        loc += length
+    if not 0 <= loc <= length:
+        raise IndexError(f"loc must be an integer between -{length} and {length}")
+    return loc

From 37d36adfad243148f7d55f5dfa9218e7221faeb3 Mon Sep 17 00:00:00 2001
From: Brock <jbrockmendel@gmail.com>
Date: Thu, 21 Oct 2021 19:43:20 -0700
Subject: [PATCH 11/57] Fix usage

---
 pandas/tests/arithmetic/test_interval.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/pandas/tests/arithmetic/test_interval.py b/pandas/tests/arithmetic/test_interval.py
index 12220e825aed4..88b26dcc4d707 100644
--- a/pandas/tests/arithmetic/test_interval.py
+++ b/pandas/tests/arithmetic/test_interval.py
@@ -28,16 +28,16 @@
         (Index([0, 2, 4, 4]), Index([1, 3, 5, 8])),
         (Index([0.0, 1.0, 2.0, np.nan]), Index([1.0, 2.0, 3.0, np.nan])),
         (
-            timedelta_range("0 days", periods=3).insert(4, pd.NaT),
-            timedelta_range("1 day", periods=3).insert(4, pd.NaT),
+            timedelta_range("0 days", periods=3).insert(3, pd.NaT),
+            timedelta_range("1 day", periods=3).insert(3, pd.NaT),
         ),
         (
-            date_range("20170101", periods=3).insert(4, pd.NaT),
-            date_range("20170102", periods=3).insert(4, pd.NaT),
+            date_range("20170101", periods=3).insert(3, pd.NaT),
+            date_range("20170102", periods=3).insert(3, pd.NaT),
         ),
         (
-            date_range("20170101", periods=3, tz="US/Eastern").insert(4, pd.NaT),
-            date_range("20170102", periods=3, tz="US/Eastern").insert(4, pd.NaT),
+            date_range("20170101", periods=3, tz="US/Eastern").insert(3, pd.NaT),
+            date_range("20170102", periods=3, tz="US/Eastern").insert(3, pd.NaT),
         ),
     ],
     ids=lambda x: str(x[0].dtype),

From bafb23f57ec15a9ad0c1818ecfd428a4c7a935de Mon Sep 17 00:00:00 2001
From: Brock <jbrockmendel@gmail.com>
Date: Thu, 21 Oct 2021 22:05:25 -0700
Subject: [PATCH 12/57] Fix TimedeltaIndex.insert test

---
 pandas/tests/indexes/timedeltas/methods/test_insert.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/pandas/tests/indexes/timedeltas/methods/test_insert.py b/pandas/tests/indexes/timedeltas/methods/test_insert.py
index 809d21db805e0..3af4b6b47fa2f 100644
--- a/pandas/tests/indexes/timedeltas/methods/test_insert.py
+++ b/pandas/tests/indexes/timedeltas/methods/test_insert.py
@@ -136,8 +136,8 @@ def test_insert_empty(self):
         result = idx[:0].insert(0, td)
         assert result.freq == "D"
 
-        result = idx[:0].insert(1, td)
-        assert result.freq == "D"
+        with pytest.raises(IndexError, match="loc must be an integer between"):
+            result = idx[:0].insert(1, td)
 
-        result = idx[:0].insert(-1, td)
-        assert result.freq == "D"
+        with pytest.raises(IndexError, match="loc must be an integer between"):
+            result = idx[:0].insert(-1, td)

From a3a349de719b060aa2ebd03b140ed5d2c4468c1d Mon Sep 17 00:00:00 2001
From: Brock <jbrockmendel@gmail.com>
Date: Fri, 22 Oct 2021 20:51:16 -0700
Subject: [PATCH 13/57] pass a few more tests

---
 pandas/_libs/index.pyx                 |  9 +++++-
 pandas/core/algorithms.py              |  2 --
 pandas/core/arrays/sparse/array.py     |  4 +--
 pandas/core/arrays/string_arrow.py     |  5 +++
 pandas/core/indexes/base.py            | 44 ++++++++++++++------------
 pandas/core/indexes/extension.py       | 25 ---------------
 pandas/tests/base/test_misc.py         |  3 ++
 pandas/tests/extension/base/getitem.py |  9 ++++++
 pandas/tests/indexes/common.py         | 30 +++++++++++++++---
 pandas/tests/indexes/test_any_index.py |  2 ++
 pandas/tests/indexing/test_indexing.py |  4 +++
 11 files changed, 82 insertions(+), 55 deletions(-)

diff --git a/pandas/_libs/index.pyx b/pandas/_libs/index.pyx
index 9bd93c50ef40c..2222bfd61664f 100644
--- a/pandas/_libs/index.pyx
+++ b/pandas/_libs/index.pyx
@@ -942,7 +942,14 @@ cdef class ExtensionEngine:
             return self.values == val
         except TypeError:
             # e.g. if __eq__ returns a BooleanArray instead of ndarry[bool]
-            return (self.values == val).to_numpy(dtype=bool, na_value=False)
+            try:
+                return (self.values == val).to_numpy(dtype=bool, na_value=False)
+            except (TypeError, AttributeError) as err:
+                # e.g. (self.values == val) returned a bool
+                #  see test_get_loc_generator[string[pyarrow]]
+                # e.g. self.values == val raises TypeError bc generator has no len
+                #  see test_get_loc_generator[string[python]]
+                raise KeyError from err
 
     cdef _maybe_get_bool_indexer(self, object val):
         # Returns ndarray[bool] or int
diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
index 2c028ffc8c471..c1b587ce3a6b2 100644
--- a/pandas/core/algorithms.py
+++ b/pandas/core/algorithms.py
@@ -1840,8 +1840,6 @@ def union_with_duplicates(lvals: ArrayLike, rvals: ArrayLike) -> ArrayLike:
     -----
     Caller is responsible for ensuring lvals.dtype == rvals.dtype.
     """
-    from pandas.core.dtypes.concat import concat_compat
-
     indexer = []
     l_count = value_counts(lvals, dropna=False)
     r_count = value_counts(rvals, dropna=False)
diff --git a/pandas/core/arrays/sparse/array.py b/pandas/core/arrays/sparse/array.py
index 87fcf54ed684b..4d48fcfefd65c 100644
--- a/pandas/core/arrays/sparse/array.py
+++ b/pandas/core/arrays/sparse/array.py
@@ -942,10 +942,10 @@ def __getitem__(
                 # mypy doesn't know we have an array here
                 key = cast(np.ndarray, key)
                 return self.take(np.arange(len(key), dtype=np.int32)[key])
-            elif hasattr(key, "__len__"):
+            elif lib.is_list_like(key):
                 return self.take(key)
             else:
-                raise ValueError(f"Cannot slice with '{key}'")
+                raise IndexError(f"Cannot slice with '{key}'")
 
         return type(self)(data_slice, kind=self.kind)
 
diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py
index ccd3e063c48a6..6858b55f6d3ef 100644
--- a/pandas/core/arrays/string_arrow.py
+++ b/pandas/core/arrays/string_arrow.py
@@ -324,6 +324,11 @@ def __getitem__(
                     item = item[1]
                 elif item[1] is Ellipsis:
                     item = item[0]
+        elif not (is_integer(item) or isinstance(item, slice) or item is Ellipsis):
+            raise IndexError(
+                "Only integers, slices and integer or "
+                "boolean arrays are valid indices."
+            )
 
         # We are not an array indexer, so maybe e.g. a slice or integer
         # indexer. We dispatch to pyarrow.
diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py
index 16dd757d20b5b..c57ff638acef0 100644
--- a/pandas/core/indexes/base.py
+++ b/pandas/core/indexes/base.py
@@ -4831,6 +4831,13 @@ def _validate_fill_value(self, value):
         TypeError
             If the value cannot be inserted into an array of this dtype.
         """
+        #if type(self) is Index and self.dtype != object:
+        #    # FIXME: kludge; work this into can_hold_element?
+        #    try:
+        #        type(self._values)._from_sequence([value], dtype=self.dtype)
+        #    except ValueError as err:
+        #        raise TypeError from err
+        #    return value
         if not can_hold_element(self._values, value):
             raise TypeError
         return value
@@ -5986,12 +5993,12 @@ def map(self, mapper, na_action=None):
                 new_values, dtype=dtype, copy=False, name=self.name
             )
 
-        res_values = maybe_cast_pointwise_result(
-            new_values, self.dtype, same_dtype=True
-        )
-        result = Index._with_infer(res_values, dtype=dtype, copy=False, name=self.name)
+        #res_values = maybe_cast_pointwise_result(
+        #    new_values, self.dtype, same_dtype=True
+        #)
+        result = Index._with_infer(new_values, dtype=dtype, copy=False, name=self.name)
 
-        if False:  # type(self) is Index and not isinstance(self.dtype, np.dtype):
+        if type(self) is Index and not isinstance(self.dtype, np.dtype):
             # TODO: what about "integer-na"
             if self.dtype.kind in ["i", "u"] and result.inferred_type == "integer":
                 # TODO: worry about itemsize/overflows?
@@ -6474,26 +6481,21 @@ def insert(self, loc: int, item) -> Index:
         if is_valid_na_for_dtype(item, self.dtype) and self.dtype != object:
             item = self._na_value
 
+        arr = self._values
+
         try:
-            item = self._validate_fill_value(item)
-        except TypeError:
+            if isinstance(arr, ExtensionArray):
+                res_values = arr.insert(loc, item)
+                return type(self)._simple_new(res_values, name=self.name)
+            else:
+                item = self._validate_fill_value(item)
+        except (TypeError, ValueError):
+            # e.g. trying to insert an integer into a DatetimeIndex
+            #  We cannot keep the same dtype, so cast to the (often object)
+            #  minimal shared dtype before doing the insert.
             dtype = self._find_common_type_compat(item)
             return self.astype(dtype).insert(loc, item)
 
-        arr = self._values
-
-        if isinstance(arr, ExtensionArray):
-            # TODO: need EA.insert
-            try:
-                arr2 = type(arr)._from_sequence([item], dtype=arr.dtype)
-            except TypeError:
-                # TODO: make this into _validate_fill_value
-                dtype = self._find_common_type_compat(item)
-                return self.astype(dtype).insert(loc, item)
-
-            res_values = arr._concat_same_type([arr[:loc], arr2, arr[loc:]])
-            return type(self)._simple_new(res_values, name=self.name)
-
         if arr.dtype != object or not isinstance(
             item, (tuple, np.datetime64, np.timedelta64)
         ):
diff --git a/pandas/core/indexes/extension.py b/pandas/core/indexes/extension.py
index ccd18f54da327..7c7f1b267b5be 100644
--- a/pandas/core/indexes/extension.py
+++ b/pandas/core/indexes/extension.py
@@ -134,31 +134,6 @@ class ExtensionIndex(Index):
 
     # ---------------------------------------------------------------------
 
-    def insert(self, loc: int, item) -> Index:
-        """
-        Make new Index inserting new item at location. Follows
-        Python list.append semantics for negative values.
-
-        Parameters
-        ----------
-        loc : int
-        item : object
-
-        Returns
-        -------
-        new_index : Index
-        """
-        try:
-            result = self._data.insert(loc, item)
-        except (ValueError, TypeError):
-            # e.g. trying to insert an integer into a DatetimeIndex
-            #  We cannot keep the same dtype, so cast to the (often object)
-            #  minimal shared dtype before doing the insert.
-            dtype = self._find_common_type_compat(item)
-            return self.astype(dtype).insert(loc, item)
-        else:
-            return type(self)._simple_new(result, name=self.name)
-
     def _validate_fill_value(self, value):
         """
         Convert value to be insertable to underlying array.
diff --git a/pandas/tests/base/test_misc.py b/pandas/tests/base/test_misc.py
index c0250e2b3e958..2a51e4e559e14 100644
--- a/pandas/tests/base/test_misc.py
+++ b/pandas/tests/base/test_misc.py
@@ -11,6 +11,7 @@
 from pandas.core.dtypes.common import (
     is_categorical_dtype,
     is_object_dtype,
+    is_dtype_equal,
 )
 
 import pandas as pd
@@ -150,6 +151,8 @@ def test_access_by_position(index):
     assert index[-1] == index[size - 1]
 
     msg = f"index {size} is out of bounds for axis 0 with size {size}"
+    if is_dtype_equal(index.dtype, "string[pyarrow]"):
+        msg = "index out of bounds"
     with pytest.raises(IndexError, match=msg):
         index[size]
     msg = "single positional indexer is out-of-bounds"
diff --git a/pandas/tests/extension/base/getitem.py b/pandas/tests/extension/base/getitem.py
index ac181af7875b5..72e0fabd3f200 100644
--- a/pandas/tests/extension/base/getitem.py
+++ b/pandas/tests/extension/base/getitem.py
@@ -120,6 +120,15 @@ def test_getitem_scalar(self, data):
         result = pd.Series(data)[0]
         assert isinstance(result, data.dtype.type)
 
+    def test_getitem_invalid(self, data):
+        # TODO: specific exception message?
+
+        with pytest.raises(IndexError):
+            data["foo"]
+
+        with pytest.raises(IndexError):
+            data[2.5]
+
     def test_getitem_scalar_na(self, data_missing, na_cmp, na_value):
         result = data_missing[0]
         assert na_cmp(result, na_value)
diff --git a/pandas/tests/indexes/common.py b/pandas/tests/indexes/common.py
index e9d09c6de64e9..9b5fb5dee6cbb 100644
--- a/pandas/tests/indexes/common.py
+++ b/pandas/tests/indexes/common.py
@@ -296,10 +296,15 @@ def test_ensure_copied_data(self, index):
             pass
         elif type(index) is Index and not isinstance(index.dtype, np.dtype):
             result = index_type(index.values, copy=False, **init_kwargs)
-            # FIXME: this is specific to MaskedArray
-            tm.assert_numpy_array_equal(index._values._data, result._values._data, check_same="same")
-            tm.assert_numpy_array_equal(index._values._mask, result._values._mask, check_same="same")
-
+            tm.assert_index_equal(result, index)
+
+            if hasattr(index._values, "_mask"):
+                # FIXME: this is specific to MaskedArray
+                tm.assert_numpy_array_equal(index._values._data, result._values._data, check_same="same")
+                tm.assert_numpy_array_equal(index._values._mask, result._values._mask, check_same="same")
+            else:
+                # e.g. string[pyarrow]
+                raise NotImplementedError
         else:
             result = index_type(index.values, copy=False, **init_kwargs)
             tm.assert_numpy_array_equal(index.values, result.values, check_same="same")
@@ -410,6 +415,23 @@ def test_insert_base(self, index):
         # test 0th element
         assert index[0:4].equals(result.insert(0, index[0]))
 
+    def test_insert_out_of_bounds(self, index):
+        # TypeError/IndexError matches what np.insert raises in these cases
+
+        # TODO: specific exception messags?
+        if len(index) > 0:
+            err = TypeError
+        else:
+            err = IndexError
+        with pytest.raises(err):
+            index.insert(0.5, "foo")
+
+        with pytest.raises(IndexError):
+            index.insert(len(index) + 1, 1)
+
+        with pytest.raises(IndexError):
+            index.insert(-len(index) - 1, 1)
+
     def test_delete_base(self, index):
         if not len(index):
             return
diff --git a/pandas/tests/indexes/test_any_index.py b/pandas/tests/indexes/test_any_index.py
index 3f85cd59bff76..d855b547a1fca 100644
--- a/pandas/tests/indexes/test_any_index.py
+++ b/pandas/tests/indexes/test_any_index.py
@@ -148,6 +148,8 @@ def test_getitem_error(self, index, item):
                     "are valid indices"
                 ),
                 "index out of bounds",  # string[pyarrow]
+                "Only integers, slices and integer or "
+                "boolean arrays are valid indices.",  # string[pyarrow]
             ]
         )
         with pytest.raises(IndexError, match=msg):
diff --git a/pandas/tests/indexing/test_indexing.py b/pandas/tests/indexing/test_indexing.py
index d5c1d6f1533d3..f12826bac7137 100644
--- a/pandas/tests/indexing/test_indexing.py
+++ b/pandas/tests/indexing/test_indexing.py
@@ -102,6 +102,10 @@ def test_getitem_ndarray_3d(
         if type(index) is Index and not isinstance(index._values, np.ndarray):
             # e.g. Int64
             msgs.append("values must be a 1D array")
+
+            # string[pyarrow]
+            msgs.append("only handle 1-dimensional arrays")
+
         msg = "|".join(msgs)
 
         potential_errors = (IndexError, ValueError, NotImplementedError)

From 4229dbfd0338e787ff5941ddd7bbbd0129af8fd0 Mon Sep 17 00:00:00 2001
From: Brock <jbrockmendel@gmail.com>
Date: Sat, 23 Oct 2021 13:22:44 -0700
Subject: [PATCH 14/57] tests

---
 pandas/core/indexes/base.py            | 15 ---------------
 pandas/tests/extension/base/getitem.py |  6 ++++++
 pandas/tests/extension/base/setitem.py |  7 +++++++
 3 files changed, 13 insertions(+), 15 deletions(-)

diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py
index c57ff638acef0..7c586bd6b25a8 100644
--- a/pandas/core/indexes/base.py
+++ b/pandas/core/indexes/base.py
@@ -26,7 +26,6 @@
     algos as libalgos,
     index as libindex,
     lib,
-    missing as libmissing,
 )
 import pandas._libs.join as libjoin
 from pandas._libs.lib import (
@@ -35,7 +34,6 @@
 )
 from pandas._libs.tslibs import (
     IncompatibleFrequency,
-    NaTType,
     OutOfBoundsDatetime,
     Timestamp,
     tz_compare,
@@ -69,7 +67,6 @@
     can_hold_element,
     find_common_type,
     infer_dtype_from,
-    maybe_cast_pointwise_result,
     validate_numeric_casting,
 )
 from pandas.core.dtypes.common import (
@@ -3640,7 +3637,6 @@ def get_indexer(
 
             indexer = self._engine.get_indexer(target.codes)
             if self.hasnans and target.hasnans:
-                # loc = self.get_loc(libmissing.NA)
                 loc = self.get_loc(np.nan)
                 mask = target.isna()
                 indexer[mask] = loc
@@ -3659,7 +3655,6 @@ def get_indexer(
                 # Exclude MultiIndex because hasnans raises NotImplementedError
                 # we should only get here if we are unique, so loc is an integer
                 # GH#41934
-                # loc = self.get_loc(libmissing.NA)
                 loc = self.get_loc(np.nan)
                 mask = target.isna()
                 indexer[mask] = loc
@@ -4831,13 +4826,6 @@ def _validate_fill_value(self, value):
         TypeError
             If the value cannot be inserted into an array of this dtype.
         """
-        #if type(self) is Index and self.dtype != object:
-        #    # FIXME: kludge; work this into can_hold_element?
-        #    try:
-        #        type(self._values)._from_sequence([value], dtype=self.dtype)
-        #    except ValueError as err:
-        #        raise TypeError from err
-        #    return value
         if not can_hold_element(self._values, value):
             raise TypeError
         return value
@@ -5993,9 +5981,6 @@ def map(self, mapper, na_action=None):
                 new_values, dtype=dtype, copy=False, name=self.name
             )
 
-        #res_values = maybe_cast_pointwise_result(
-        #    new_values, self.dtype, same_dtype=True
-        #)
         result = Index._with_infer(new_values, dtype=dtype, copy=False, name=self.name)
 
         if type(self) is Index and not isinstance(self.dtype, np.dtype):
diff --git a/pandas/tests/extension/base/getitem.py b/pandas/tests/extension/base/getitem.py
index 72e0fabd3f200..c8b9c03940672 100644
--- a/pandas/tests/extension/base/getitem.py
+++ b/pandas/tests/extension/base/getitem.py
@@ -129,6 +129,12 @@ def test_getitem_invalid(self, data):
         with pytest.raises(IndexError):
             data[2.5]
 
+        ub = len(data)
+        with pytest.raises(IndexError):
+            data[ub + 1]
+        with pytest.raises(IndexError):
+            data[-ub - 1]
+
     def test_getitem_scalar_na(self, data_missing, na_cmp, na_value):
         result = data_missing[0]
         assert na_cmp(result, na_value)
diff --git a/pandas/tests/extension/base/setitem.py b/pandas/tests/extension/base/setitem.py
index 0392ea794237c..69b2b02bd4591 100644
--- a/pandas/tests/extension/base/setitem.py
+++ b/pandas/tests/extension/base/setitem.py
@@ -367,3 +367,10 @@ def test_delitem_series(self, data):
         expected = ser[taker]
         del ser[1]
         self.assert_series_equal(ser, expected)
+
+    def test_setitem_invalid(self, data, invalid_scalar):
+        with pytest.raises((ValueError, TypeError)):
+            data[0] = invalid_scalar
+
+        with pytest.raises((ValueError, TypeError)):
+            data[:] = invalid_scalar

From 2e1843a0cb811a577da156bd11a7990a8ee0c8a7 Mon Sep 17 00:00:00 2001
From: Brock <jbrockmendel@gmail.com>
Date: Sun, 24 Oct 2021 14:38:31 -0700
Subject: [PATCH 15/57] REF: share ExtensionIndex.insert-> Index.insert

---
 pandas/core/dtypes/missing.py       |  4 ++++
 pandas/core/indexes/base.py         | 15 +++++++++++----
 pandas/core/indexes/extension.py    | 25 -------------------------
 pandas/tests/dtypes/test_missing.py |  9 +++++++++
 4 files changed, 24 insertions(+), 29 deletions(-)

diff --git a/pandas/core/dtypes/missing.py b/pandas/core/dtypes/missing.py
index f5fbd4cc4a7fc..38553bc1be8d6 100644
--- a/pandas/core/dtypes/missing.py
+++ b/pandas/core/dtypes/missing.py
@@ -37,6 +37,7 @@
     needs_i8_conversion,
 )
 from pandas.core.dtypes.dtypes import (
+    CategoricalDtype,
     ExtensionDtype,
     IntervalDtype,
     PeriodDtype,
@@ -641,5 +642,8 @@ def is_valid_na_for_dtype(obj, dtype: DtypeObj) -> bool:
     elif isinstance(dtype, IntervalDtype):
         return lib.is_float(obj) or obj is None or obj is libmissing.NA
 
+    elif isinstance(dtype, CategoricalDtype):
+        return is_valid_na_for_dtype(obj, dtype.categories.dtype)
+
     # fallback, default to allowing NaN, None, NA, NaT
     return not isinstance(obj, (np.datetime64, np.timedelta64, Decimal))
diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py
index 05047540c6ccd..e82bd61938f15 100644
--- a/pandas/core/indexes/base.py
+++ b/pandas/core/indexes/base.py
@@ -6432,14 +6432,21 @@ def insert(self, loc: int, item) -> Index:
         if is_valid_na_for_dtype(item, self.dtype) and self.dtype != object:
             item = self._na_value
 
+        arr = self._values
+
         try:
-            item = self._validate_fill_value(item)
-        except TypeError:
+            if isinstance(arr, ExtensionArray):
+                res_values = arr.insert(loc, item)
+                return type(self)._simple_new(res_values, name=self.name)
+            else:
+                item = self._validate_fill_value(item)
+        except (TypeError, ValueError):
+            # e.g. trying to insert an integer into a DatetimeIndex
+            #  We cannot keep the same dtype, so cast to the (often object)
+            #  minimal shared dtype before doing the insert.
             dtype = self._find_common_type_compat(item)
             return self.astype(dtype).insert(loc, item)
 
-        arr = self._values
-
         if arr.dtype != object or not isinstance(
             item, (tuple, np.datetime64, np.timedelta64)
         ):
diff --git a/pandas/core/indexes/extension.py b/pandas/core/indexes/extension.py
index ccd18f54da327..7c7f1b267b5be 100644
--- a/pandas/core/indexes/extension.py
+++ b/pandas/core/indexes/extension.py
@@ -134,31 +134,6 @@ class ExtensionIndex(Index):
 
     # ---------------------------------------------------------------------
 
-    def insert(self, loc: int, item) -> Index:
-        """
-        Make new Index inserting new item at location. Follows
-        Python list.append semantics for negative values.
-
-        Parameters
-        ----------
-        loc : int
-        item : object
-
-        Returns
-        -------
-        new_index : Index
-        """
-        try:
-            result = self._data.insert(loc, item)
-        except (ValueError, TypeError):
-            # e.g. trying to insert an integer into a DatetimeIndex
-            #  We cannot keep the same dtype, so cast to the (often object)
-            #  minimal shared dtype before doing the insert.
-            dtype = self._find_common_type_compat(item)
-            return self.astype(dtype).insert(loc, item)
-        else:
-            return type(self)._simple_new(result, name=self.name)
-
     def _validate_fill_value(self, value):
         """
         Convert value to be insertable to underlying array.
diff --git a/pandas/tests/dtypes/test_missing.py b/pandas/tests/dtypes/test_missing.py
index bf68c4b79bcea..55d0e5e73418e 100644
--- a/pandas/tests/dtypes/test_missing.py
+++ b/pandas/tests/dtypes/test_missing.py
@@ -18,6 +18,7 @@
     is_scalar,
 )
 from pandas.core.dtypes.dtypes import (
+    CategoricalDtype,
     DatetimeTZDtype,
     IntervalDtype,
     PeriodDtype,
@@ -739,3 +740,11 @@ def test_is_valid_na_for_dtype_interval(self):
 
         dtype = IntervalDtype("datetime64[ns]", "both")
         assert not is_valid_na_for_dtype(NaT, dtype)
+
+    def test_is_valid_na_for_dtype_categorical(self):
+        dtype = CategoricalDtype(categories=[0, 1, 2])
+        assert is_valid_na_for_dtype(np.nan, dtype)
+
+        assert not is_valid_na_for_dtype(NaT, dtype)
+        assert not is_valid_na_for_dtype(np.datetime64("NaT", "ns"), dtype)
+        assert not is_valid_na_for_dtype(np.timedelta64("NaT", "ns"), dtype)

From 2bb1dea3c78a7ee95a82e0b0267a49619d18a0f3 Mon Sep 17 00:00:00 2001
From: Brock <jbrockmendel@gmail.com>
Date: Tue, 26 Oct 2021 12:24:47 -0700
Subject: [PATCH 16/57] handle a few more tests

---
 pandas/core/indexes/base.py          |  2 ++
 pandas/tests/extension/test_numpy.py |  5 +++++
 pandas/tests/indexes/common.py       | 30 +++++++++++++++++++++-------
 3 files changed, 30 insertions(+), 7 deletions(-)

diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py
index 7c586bd6b25a8..efa47ea7220e5 100644
--- a/pandas/core/indexes/base.py
+++ b/pandas/core/indexes/base.py
@@ -5993,6 +5993,8 @@ def map(self, mapper, na_action=None):
                 result = result.astype(self.dtype, copy=False)
             elif self.dtype == "boolean" and result.inferred_type == "boolean":
                 result = result.astype(self.dtype, copy=False)
+            elif self.dtype == "string" and result.inferred_type == "string":
+                result = result.astype(self.dtype, copy=False)
 
         return result
 
diff --git a/pandas/tests/extension/test_numpy.py b/pandas/tests/extension/test_numpy.py
index 0e3e26e7e9500..8a643b999f464 100644
--- a/pandas/tests/extension/test_numpy.py
+++ b/pandas/tests/extension/test_numpy.py
@@ -446,6 +446,11 @@ def test_setitem_with_expansion_dataframe_column(self, data, full_indexer):
                 expected = pd.DataFrame({"data": data.to_numpy()})
         self.assert_frame_equal(result, expected)
 
+    @skip_nested
+    def test_setitem_invalid(self, data, invalid_scalar):
+        # _nothing_ is invalid for object dtype
+        super().test_setitem_invalid(data, invalid_scalar)
+
 
 @skip_nested
 class TestParsing(BaseNumPyTests, base.BaseParsingTests):
diff --git a/pandas/tests/indexes/common.py b/pandas/tests/indexes/common.py
index 9b5fb5dee6cbb..1ed001e619dd5 100644
--- a/pandas/tests/indexes/common.py
+++ b/pandas/tests/indexes/common.py
@@ -36,6 +36,7 @@
     Int64Index,
     UInt64Index,
 )
+from pandas.core.arrays import BaseMaskedArray
 
 
 class Base:
@@ -298,13 +299,26 @@ def test_ensure_copied_data(self, index):
             result = index_type(index.values, copy=False, **init_kwargs)
             tm.assert_index_equal(result, index)
 
-            if hasattr(index._values, "_mask"):
-                # FIXME: this is specific to MaskedArray
-                tm.assert_numpy_array_equal(index._values._data, result._values._data, check_same="same")
-                tm.assert_numpy_array_equal(index._values._mask, result._values._mask, check_same="same")
+            if isinstance(index._values, BaseMaskedArray):
+                assert np.shares_memory(index._values._data, result._values._data)
+                tm.assert_numpy_array_equal(
+                    index._values._data, result._values._data, check_same="same"
+                )
+                assert np.shares_memory(index._values._mask, result._values._mask)
+                tm.assert_numpy_array_equal(
+                    index._values._mask, result._values._mask, check_same="same"
+                )
+            elif index.dtype == "string[python]":
+                assert np.shares_memory(index._values._ndarray, result._values._ndarray)
+                tm.assert_numpy_array_equal(
+                    index._values._ndarray, result._values._ndarray, check_same="same"
+                )
+            elif index.dtype == "string[pyarrow]":
+                raise NotImplementedError(
+                    "How do we check that we don't have a copy? xref #44152"
+                )
             else:
-                # e.g. string[pyarrow]
-                raise NotImplementedError
+                raise NotImplementedError(index.dtype)
         else:
             result = index_type(index.values, copy=False, **init_kwargs)
             tm.assert_numpy_array_equal(index.values, result.values, check_same="same")
@@ -325,7 +339,9 @@ def test_memory_usage(self, index):
         # RangeIndex, IntervalIndex
         # don't have engines
         # Index[EA] has engine but it does not have a Hashtable .mapping
-        if not isinstance(index, (RangeIndex, IntervalIndex)) and not (type(index) is Index and not isinstance(index.dtype, np.dtype)):
+        if not isinstance(index, (RangeIndex, IntervalIndex)) and not (
+            type(index) is Index and not isinstance(index.dtype, np.dtype)
+        ):
             assert result2 > result
 
         if index.inferred_type == "object":

From 36b662908a960f824cf8a7392a61bdceb5850546 Mon Sep 17 00:00:00 2001
From: Brock <jbrockmendel@gmail.com>
Date: Wed, 27 Oct 2021 12:56:36 -0700
Subject: [PATCH 17/57] update test

---
 pandas/tests/indexes/common.py | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/pandas/tests/indexes/common.py b/pandas/tests/indexes/common.py
index 1ed001e619dd5..db5f2fb78bbde 100644
--- a/pandas/tests/indexes/common.py
+++ b/pandas/tests/indexes/common.py
@@ -314,9 +314,12 @@ def test_ensure_copied_data(self, index):
                     index._values._ndarray, result._values._ndarray, check_same="same"
                 )
             elif index.dtype == "string[pyarrow]":
-                raise NotImplementedError(
-                    "How do we check that we don't have a copy? xref #44152"
-                )
+                # https://github.com/pandas-dev/pandas/pull/43930#discussion_r736862669
+                result_pa_data = result._values._data
+                index_pa_data = index._values._data
+                res_buf1 = result_pa_data.chunk(0).buffers()[1]
+                idx_buf1 = index_pa_data.chunk(0).buffers()[1]
+                assert res_buf1.address == idx_buf1.address
             else:
                 raise NotImplementedError(index.dtype)
         else:
@@ -434,7 +437,7 @@ def test_insert_base(self, index):
     def test_insert_out_of_bounds(self, index):
         # TypeError/IndexError matches what np.insert raises in these cases
 
-        # TODO: specific exception messags?
+        # TODO: specific exception messages?
         if len(index) > 0:
             err = TypeError
         else:

From 1881599fd0041d6a4f0048123d034802fc8a41bd Mon Sep 17 00:00:00 2001
From: Brock <jbrockmendel@gmail.com>
Date: Fri, 29 Oct 2021 12:21:25 -0700
Subject: [PATCH 18/57] Fix remaining tests

---
 pandas/core/algorithms.py                |  2 +-
 pandas/core/indexes/base.py              | 24 +++++++++-----------
 pandas/tests/base/test_misc.py           |  9 ++++++--
 pandas/tests/extension/json/array.py     | 10 ++++++++-
 pandas/tests/extension/json/test_json.py | 14 ++++++++++++
 pandas/tests/indexes/common.py           | 28 +++++-------------------
 pandas/tests/indexes/test_any_index.py   | 14 +-----------
 pandas/tests/indexes/test_base.py        |  5 +++++
 pandas/tests/indexes/test_setops.py      | 15 ++++++++++++-
 pandas/tests/test_algos.py               |  3 +--
 10 files changed, 68 insertions(+), 56 deletions(-)

diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
index c1b587ce3a6b2..a1005d5f2cf9f 100644
--- a/pandas/core/algorithms.py
+++ b/pandas/core/algorithms.py
@@ -1848,5 +1848,5 @@ def union_with_duplicates(lvals: ArrayLike, rvals: ArrayLike) -> ArrayLike:
     unique_array = ensure_wrapped_if_datetimelike(unique_array)
 
     for i, value in enumerate(unique_array):
-        indexer += [i] * int(max(l_count[value], r_count[value]))
+        indexer += [i] * int(max(l_count.at[value], r_count.at[value]))
     return unique_array.take(indexer)
diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py
index efa47ea7220e5..9b741ecd8ac1a 100644
--- a/pandas/core/indexes/base.py
+++ b/pandas/core/indexes/base.py
@@ -67,6 +67,7 @@
     can_hold_element,
     find_common_type,
     infer_dtype_from,
+    maybe_cast_pointwise_result,
     validate_numeric_casting,
 )
 from pandas.core.dtypes.common import (
@@ -5974,6 +5975,15 @@ def map(self, mapper, na_action=None):
             # empty
             dtype = self.dtype
 
+        # e.g. if we are floating and new_values is all ints, then we
+        #  don't want to cast back to floating.  But if we are UInt64
+        #  and new_values is all ints, we want to try.
+        same_dtype = lib.infer_dtype(new_values, skipna=False) == self.inferred_type
+        if same_dtype:
+            new_values = maybe_cast_pointwise_result(
+                new_values, self.dtype, same_dtype=same_dtype
+            )
+
         if self._is_backward_compat_public_numeric_index and is_numeric_dtype(
             new_values.dtype
         ):
@@ -5982,20 +5992,6 @@ def map(self, mapper, na_action=None):
             )
 
         result = Index._with_infer(new_values, dtype=dtype, copy=False, name=self.name)
-
-        if type(self) is Index and not isinstance(self.dtype, np.dtype):
-            # TODO: what about "integer-na"
-            if self.dtype.kind in ["i", "u"] and result.inferred_type == "integer":
-                # TODO: worry about itemsize/overflows?
-                result = result.astype(self.dtype, copy=False)
-            elif self.dtype.kind == "f" and result.inferred_type == "floating":
-                # TODO: worry about itemsize/overflows?
-                result = result.astype(self.dtype, copy=False)
-            elif self.dtype == "boolean" and result.inferred_type == "boolean":
-                result = result.astype(self.dtype, copy=False)
-            elif self.dtype == "string" and result.inferred_type == "string":
-                result = result.astype(self.dtype, copy=False)
-
         return result
 
     # TODO: De-duplicate with map, xref GH#32349
diff --git a/pandas/tests/base/test_misc.py b/pandas/tests/base/test_misc.py
index 2a51e4e559e14..028fa45658d15 100644
--- a/pandas/tests/base/test_misc.py
+++ b/pandas/tests/base/test_misc.py
@@ -10,8 +10,8 @@
 
 from pandas.core.dtypes.common import (
     is_categorical_dtype,
-    is_object_dtype,
     is_dtype_equal,
+    is_object_dtype,
 )
 
 import pandas as pd
@@ -77,12 +77,16 @@ def test_memory_usage(index_or_series_obj):
     res = obj.memory_usage()
     res_deep = obj.memory_usage(deep=True)
 
+    is_ser = isinstance(obj, Series)
     is_object = is_object_dtype(obj) or (
         isinstance(obj, Series) and is_object_dtype(obj.index)
     )
     is_categorical = is_categorical_dtype(obj.dtype) or (
         isinstance(obj, Series) and is_categorical_dtype(obj.index.dtype)
     )
+    is_string = is_dtype_equal(obj, "string[python]") or (
+        is_ser and is_dtype_equal(obj.index.dtype, "string[python]")
+    )
 
     if len(obj) == 0:
         if isinstance(obj, Index):
@@ -90,9 +94,10 @@ def test_memory_usage(index_or_series_obj):
         else:
             expected = 108 if IS64 else 64
         assert res_deep == res == expected
-    elif is_object or is_categorical:
+    elif is_object or is_categorical or is_string:
         # only deep will pick them up
         assert res_deep > res
+        assert res_deep > res
     else:
         assert res == res_deep
 
diff --git a/pandas/tests/extension/json/array.py b/pandas/tests/extension/json/array.py
index 2eef828288e59..7bcd666f08b12 100644
--- a/pandas/tests/extension/json/array.py
+++ b/pandas/tests/extension/json/array.py
@@ -39,7 +39,10 @@
     ExtensionArray,
     ExtensionDtype,
 )
-from pandas.api.types import is_bool_dtype
+from pandas.api.types import (
+    is_bool_dtype,
+    is_list_like,
+)
 
 
 class JSONDtype(ExtensionDtype):
@@ -103,6 +106,11 @@ def __getitem__(self, item):
         elif isinstance(item, slice):
             # slice
             return type(self)(self.data[item])
+        elif not is_list_like(item):
+            raise IndexError(
+                "Only integers, slices and integer or "
+                "boolean arrays are valid indices."
+            )
         else:
             item = pd.api.indexers.check_array_indexer(self, item)
             if is_bool_dtype(item.dtype):
diff --git a/pandas/tests/extension/json/test_json.py b/pandas/tests/extension/json/test_json.py
index f090396a70724..d5ecd8909319e 100644
--- a/pandas/tests/extension/json/test_json.py
+++ b/pandas/tests/extension/json/test_json.py
@@ -306,6 +306,20 @@ def test_groupby_extension_apply(self):
         we'll be able to dispatch unique.
         """
 
+    @unhashable
+    def test_groupby_extension_agg(self):
+        """
+        This fails when we get to tm.assert_series_equal when left.index
+        contains dictionaries, which are not hashable.
+        """
+
+    @unhashable
+    def test_groupby_extension_no_sort(self):
+        """
+        This fails when we get to tm.assert_series_equal when left.index
+        contains dictionaries, which are not hashable.
+        """
+
     @pytest.mark.xfail(reason="GH#39098: Converts agg result to object")
     def test_groupby_agg_extension(self, data_for_grouping):
         super().test_groupby_agg_extension(data_for_grouping)
diff --git a/pandas/tests/indexes/common.py b/pandas/tests/indexes/common.py
index db5f2fb78bbde..53b109139cc65 100644
--- a/pandas/tests/indexes/common.py
+++ b/pandas/tests/indexes/common.py
@@ -10,9 +10,7 @@
 
 from pandas.core.dtypes.common import (
     is_datetime64tz_dtype,
-    is_float_dtype,
     is_integer_dtype,
-    is_unsigned_integer_dtype,
 )
 from pandas.core.dtypes.dtypes import CategoricalDtype
 
@@ -641,20 +639,9 @@ def test_map(self, simple_index):
         # callable
         idx = simple_index
 
-        # we don't infer UInt64
-        if is_integer_dtype(idx.dtype):
-            expected = idx.astype("int64")
-        elif is_float_dtype(idx.dtype):
-            expected = idx.astype("float64")
-            if idx._is_backward_compat_public_numeric_index:
-                # We get a NumericIndex back, not Float64Index
-                expected = type(idx)(expected)
-        else:
-            expected = idx
-
         result = idx.map(lambda x: x)
         # For RangeIndex we convert to Int64Index
-        tm.assert_index_equal(result, expected, exact="equiv")
+        tm.assert_index_equal(result, idx, exact="equiv")
 
     @pytest.mark.parametrize(
         "mapper",
@@ -671,23 +658,20 @@ def test_map_dictlike(self, mapper, simple_index):
 
         identity = mapper(idx.values, idx)
 
-        # we don't infer to UInt64 for a dict
-        if is_unsigned_integer_dtype(idx.dtype) and isinstance(identity, dict):
-            expected = idx.astype("int64")
-        else:
-            expected = idx
-
         result = idx.map(identity)
         # For RangeIndex we convert to Int64Index
-        tm.assert_index_equal(result, expected, exact="equiv")
+        tm.assert_index_equal(result, idx, exact="equiv")
 
         # empty mappable
+        dtype = None
         if idx._is_backward_compat_public_numeric_index:
             new_index_cls = NumericIndex
+            if idx.dtype.kind == "f":
+                dtype = idx.dtype
         else:
             new_index_cls = Float64Index
 
-        expected = new_index_cls([np.nan] * len(idx))
+        expected = new_index_cls([np.nan] * len(idx), dtype=dtype)
         result = idx.map(mapper(expected, idx))
         tm.assert_index_equal(result, expected)
 
diff --git a/pandas/tests/indexes/test_any_index.py b/pandas/tests/indexes/test_any_index.py
index 3762c0efe1cf8..29d9eb9909338 100644
--- a/pandas/tests/indexes/test_any_index.py
+++ b/pandas/tests/indexes/test_any_index.py
@@ -3,11 +3,8 @@
 """
 import re
 
-import numpy as np
 import pytest
 
-from pandas.core.dtypes.common import is_float_dtype
-
 import pandas._testing as tm
 
 
@@ -49,16 +46,7 @@ def test_mutability(index):
 def test_map_identity_mapping(index):
     # GH#12766
     result = index.map(lambda x: x)
-    if index._is_backward_compat_public_numeric_index:
-        if is_float_dtype(index.dtype):
-            expected = index.astype(np.float64)
-        elif index.dtype == np.uint64:
-            expected = index.astype(np.uint64)
-        else:
-            expected = index.astype(np.int64)
-    else:
-        expected = index
-    tm.assert_index_equal(result, expected, exact="equiv")
+    tm.assert_index_equal(result, index, exact="equiv")
 
 
 def test_wrong_number_names(index):
diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py
index 9abf1abbc4365..9bcbeb411e51f 100644
--- a/pandas/tests/indexes/test_base.py
+++ b/pandas/tests/indexes/test_base.py
@@ -722,10 +722,15 @@ def test_map_dictlike(self, index, mapper):
                 exp_dtype = np.float64
             else:
                 exp_dtype = np.int64
+            exp_dtype = index.dtype
             expected = index._constructor(np.arange(len(index), 0, -1), dtype=exp_dtype)
         elif type(index) is Index and index.dtype != object:
             # i.e. EA-backed, for now just Nullable
             expected = Index(np.arange(len(index), 0, -1), dtype=index.dtype)
+        elif index.dtype.kind == "u":
+            # TODO: case where e.g. we cannot hold result in UInt8?
+            expected = Index(np.arange(len(index), 0, -1), dtype=index.dtype)
+
         else:
             expected = Index(np.arange(len(index), 0, -1))
 
diff --git a/pandas/tests/indexes/test_setops.py b/pandas/tests/indexes/test_setops.py
index a0e97223435e6..bec0a41b9bb6b 100644
--- a/pandas/tests/indexes/test_setops.py
+++ b/pandas/tests/indexes/test_setops.py
@@ -9,6 +9,7 @@
 import pytest
 
 from pandas.core.dtypes.cast import find_common_type
+from pandas.core.dtypes.common import is_dtype_equal
 
 from pandas import (
     CategoricalIndex,
@@ -46,12 +47,24 @@ def test_union_same_types(index):
     assert idx1.union(idx2).dtype == idx1.dtype
 
 
-def test_union_different_types(index_flat, index_flat2):
+def test_union_different_types(index_flat, index_flat2, request):
     # This test only considers combinations of indices
     # GH 23525
     idx1 = index_flat
     idx2 = index_flat2
 
+    if (
+        not idx1.is_unique
+        and idx1.dtype.kind == "i"
+        and is_dtype_equal(idx2.dtype, "boolean")
+    ) or (
+        not idx2.is_unique
+        and idx2.dtype.kind == "i"
+        and is_dtype_equal(idx1.dtype, "boolean")
+    ):
+        mark = pytest.mark.xfail(reason="GH#44000 True==1", raises=ValueError)
+        request.node.add_marker(mark)
+
     common_dtype = find_common_type([idx1.dtype, idx2.dtype])
 
     any_uint64 = idx1.dtype == np.uint64 or idx2.dtype == np.uint64
diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py
index 779d6e6b6bb0f..2c7b5a7cccd95 100644
--- a/pandas/tests/test_algos.py
+++ b/pandas/tests/test_algos.py
@@ -2446,6 +2446,5 @@ def test_union_with_duplicates(op):
         result = algos.union_with_duplicates(lvals, rvals)
         tm.assert_numpy_array_equal(result, expected)
     else:
-        with tm.assert_produces_warning(RuntimeWarning):
-            result = algos.union_with_duplicates(lvals, rvals)
+        result = algos.union_with_duplicates(lvals, rvals)
         tm.assert_extension_array_equal(result, expected)

From 1f973251c1014e540d42316d3569f59d50ceb12a Mon Sep 17 00:00:00 2001
From: Brock <jbrockmendel@gmail.com>
Date: Fri, 29 Oct 2021 13:48:22 -0700
Subject: [PATCH 19/57] no-pyarrow-compat

---
 pandas/conftest.py | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/pandas/conftest.py b/pandas/conftest.py
index ef48dc0055615..b87a5e286c640 100644
--- a/pandas/conftest.py
+++ b/pandas/conftest.py
@@ -66,6 +66,14 @@
     MultiIndex,
 )
 
+try:
+    import pyarrow as pa
+except ImportError:
+    has_pyarrow = False
+else:
+    del pa
+    has_pyarrow = True
+
 # Until https://github.com/numpy/numpy/issues/19078 is sorted out, just suppress
 suppress_npdev_promotion_warning = pytest.mark.filterwarnings(
     "ignore:Promotion of numbers and bools:FutureWarning"
@@ -518,8 +526,10 @@ def _create_mi_with_dt64tz_level():
     "nullable_float": Index(np.arange(100), dtype="Float32"),
     "nullable_bool": Index(np.arange(100).astype(bool), dtype="boolean"),
     "string-python": Index(pd.array(tm.makeStringIndex(100), dtype="string[python]")),
-    "string-pyarrow": Index(pd.array(tm.makeStringIndex(100), dtype="string[pyarrow]")),
 }
+if has_pyarrow:
+    idx = Index(pd.array(tm.makeStringIndex(100), dtype="string[pyarrow]"))
+    indices_dict["string-pyarrow"] = idx
 
 
 @pytest.fixture(params=indices_dict.keys())

From 076cada69b3155de6e15120c2f3f737511e6b794 Mon Sep 17 00:00:00 2001
From: Brock <jbrockmendel@gmail.com>
Date: Fri, 29 Oct 2021 13:51:42 -0700
Subject: [PATCH 20/57] mypy fixups

---
 pandas/_libs/index.pyi       |  6 ++++++
 pandas/_libs/testing.pyx     | 13 ++-----------
 pandas/core/arrays/masked.py | 11 ++---------
 pandas/core/indexes/base.py  | 11 ++++++-----
 4 files changed, 16 insertions(+), 25 deletions(-)

diff --git a/pandas/_libs/index.pyi b/pandas/_libs/index.pyi
index 446a980487cde..fd521fc446690 100644
--- a/pandas/_libs/index.pyi
+++ b/pandas/_libs/index.pyi
@@ -63,3 +63,9 @@ class BaseMultiIndexCodesEngine:
         method: str,
         limit: int | None,
     ) -> npt.NDArray[np.intp]: ...
+
+class NullableEngine:
+    pass
+
+class ExtensionEngine:
+    pass
diff --git a/pandas/_libs/testing.pyx b/pandas/_libs/testing.pyx
index d2566123815f1..cfe9f40f12452 100644
--- a/pandas/_libs/testing.pyx
+++ b/pandas/_libs/testing.pyx
@@ -7,7 +7,6 @@ from numpy cimport import_array
 
 import_array()
 
-from pandas._libs.missing cimport is_matching_na
 from pandas._libs.util cimport (
     is_array,
     is_complex_object,
@@ -176,18 +175,10 @@ cpdef assert_almost_equal(a, b,
         assert_class_equal(a, b, obj=obj)
 
     if isna(a) and isna(b):
+        # TODO: Should require same-dtype NA?
+        # nan / None comparison
         return True
-        #if isna(b):
-        #    # TODO: Should require same-dtype NA?
-        #    # nan / None comparison
-        #    return True
-        #
-        #assert False, f"expected {a} but got {b}"
 
-    #elif isna(b):
-    #    assert False, f"expected {a} but got {b}"
-
-    # TODO: test for tm.assert_whatever with pd.NA that would raise here
     if a == b:
         # object comparison
         return True
diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py
index 51429a4fd0418..b49f995ec35f7 100644
--- a/pandas/core/arrays/masked.py
+++ b/pandas/core/arrays/masked.py
@@ -614,10 +614,7 @@ def value_counts(self, dropna: bool = True) -> Series:
         data = self._data[~self._mask]
         value_counts = Index(data).value_counts()
 
-        # TODO(extension)
-        # if we have allow Index to hold an ExtensionArray
-        # this is easier
-        index = value_counts.index  # ._values.astype(object)
+        index = value_counts.index
 
         # if we want nans, count the mask
         if dropna:
@@ -628,12 +625,8 @@ def value_counts(self, dropna: bool = True) -> Series:
             counts[-1] = self._mask.sum()
 
             index = index.insert(len(index), self.dtype.na_value)
-            # index = Index(
-            #    np.concatenate([index, np.array([self.dtype.na_value], dtype=object)]),
-            #    dtype=object,
-            # )
 
-        index = index.astype(self.dtype)
+        assert index.dtype == self.dtype
 
         mask = np.zeros(len(counts), dtype="bool")
         counts = IntegerArray(counts, mask)
diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py
index 9b741ecd8ac1a..ca82622b98cbe 100644
--- a/pandas/core/indexes/base.py
+++ b/pandas/core/indexes/base.py
@@ -360,7 +360,10 @@ def _outer_indexer(
 
     _typ: str = "index"
     _data: ExtensionArray | np.ndarray
-    _data_cls: type[np.ndarray] | type[ExtensionArray] = (np.ndarray, ExtensionArray)
+    _data_cls: tuple[type[np.ndarray], type[ExtensionArray]] = (
+        np.ndarray,
+        ExtensionArray,
+    )
     _id: object | None = None
     _name: Hashable = None
     # MultiIndex.levels previously allowed setting the index name. We
@@ -4452,8 +4455,7 @@ def _join_non_unique(
         if isinstance(join_array, np.ndarray):
             np.putmask(join_array, mask, right)
         else:
-            # error: "ExtensionArray" has no attribute "putmask"
-            join_array.putmask(mask, right)  # type: ignore[attr-defined]
+            join_array.putmask(mask, right)
 
         join_index = self._wrap_joined_index(join_array, other)
 
@@ -5057,8 +5059,7 @@ def putmask(self, mask, value) -> Index:
         else:
             # Note: we use the original value here, not converted, as
             #  _validate_fill_value is not idempotent
-            # error: "ExtensionArray" has no attribute "putmask"
-            values.putmask(mask, value)  # type: ignore[attr-defined]
+            values.putmask(mask, value)
 
         return self._shallow_copy(values)
 

From 1e8a31f321f453fcad5e9bde511cb70753ea46c1 Mon Sep 17 00:00:00 2001
From: Brock <jbrockmendel@gmail.com>
Date: Fri, 29 Oct 2021 22:00:14 -0700
Subject: [PATCH 21/57] remove assertion

---
 pandas/core/indexes/base.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py
index ca82622b98cbe..65c401a94d8b2 100644
--- a/pandas/core/indexes/base.py
+++ b/pandas/core/indexes/base.py
@@ -844,7 +844,6 @@ def _engine(self) -> libindex.IndexEngine:
         ):
             return libindex.ExtensionEngine(self._values)
 
-        assert self.dtype != "boolean"
         # to avoid a reference cycle, bind `target_values` to a local variable, so
         # `self` is not passed into the lambda.
         target_values = self._get_engine_target()

From 2d5fa6d60363bdee1aa85c87d40d5b436104a2b3 Mon Sep 17 00:00:00 2001
From: Brock <jbrockmendel@gmail.com>
Date: Sun, 31 Oct 2021 11:54:15 -0700
Subject: [PATCH 22/57] restore astype

---
 pandas/core/arrays/masked.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py
index b49f995ec35f7..e8b2d0791a2de 100644
--- a/pandas/core/arrays/masked.py
+++ b/pandas/core/arrays/masked.py
@@ -626,7 +626,7 @@ def value_counts(self, dropna: bool = True) -> Series:
 
             index = index.insert(len(index), self.dtype.na_value)
 
-        assert index.dtype == self.dtype
+        index = index.astype(self.dtype)
 
         mask = np.zeros(len(counts), dtype="bool")
         counts = IntegerArray(counts, mask)

From adf3ddbc84b5a80e868d52f9c0ca96f7461be2e8 Mon Sep 17 00:00:00 2001
From: Brock <jbrockmendel@gmail.com>
Date: Sun, 31 Oct 2021 18:49:41 -0700
Subject: [PATCH 23/57] older numpy compat

---
 pandas/_libs/index.pyx | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/pandas/_libs/index.pyx b/pandas/_libs/index.pyx
index 2222bfd61664f..7499011e1060e 100644
--- a/pandas/_libs/index.pyx
+++ b/pandas/_libs/index.pyx
@@ -1028,7 +1028,11 @@ cdef class ExtensionEngine:
 
             indexer.append(locs)
 
-        indexer = np.concatenate(indexer, dtype=np.intp)
+        try:
+            indexer = np.concatenate(indexer, dtype=np.intp)
+        except TypeError:
+            # numpy<1.20 doesn't accept dtype keyword
+            indexer = np.concatenate(indexer).astype(np.intp, copy=False)
         missing = np.array(missing, dtype=np.intp)
 
         return indexer, missing
@@ -1300,7 +1304,11 @@ cdef class NullableEngine:
 
             indexer.append(locs)
 
-        indexer = np.concatenate(indexer, dtype=np.intp)
+        try:
+            indexer = np.concatenate(indexer, dtype=np.intp)
+        except TypeError:
+            # numpy<1.20 doesn't accept dtype keyword
+            indexer = np.concatenate(indexer).astype(np.intp, copy=False)
         missing = np.array(missing, dtype=np.intp)
 
         return indexer, missing

From fd6880e572d0991be6eba7e23645c1e5986f6c00 Mon Sep 17 00:00:00 2001
From: Brock <jbrockmendel@gmail.com>
Date: Mon, 1 Nov 2021 12:47:46 -0700
Subject: [PATCH 24/57] xfail

---
 pandas/tests/base/test_misc.py        | 4 +++-
 pandas/tests/extension/test_string.py | 9 ++++++++-
 2 files changed, 11 insertions(+), 2 deletions(-)

diff --git a/pandas/tests/base/test_misc.py b/pandas/tests/base/test_misc.py
index 028fa45658d15..dbf76149ed3cd 100644
--- a/pandas/tests/base/test_misc.py
+++ b/pandas/tests/base/test_misc.py
@@ -6,6 +6,7 @@
 from pandas.compat import (
     IS64,
     PYPY,
+    pa_version_under1p0,
 )
 
 from pandas.core.dtypes.common import (
@@ -156,7 +157,8 @@ def test_access_by_position(index):
     assert index[-1] == index[size - 1]
 
     msg = f"index {size} is out of bounds for axis 0 with size {size}"
-    if is_dtype_equal(index.dtype, "string[pyarrow]"):
+    if pa_version_under1p0 and is_dtype_equal(index.dtype, "string[pyarrow]"):
+        # TODO(GH#44276) pa_version_under1p0 check should be unnecessary
         msg = "index out of bounds"
     with pytest.raises(IndexError, match=msg):
         index[size]
diff --git a/pandas/tests/extension/test_string.py b/pandas/tests/extension/test_string.py
index 5049116a9320e..af6c149447d15 100644
--- a/pandas/tests/extension/test_string.py
+++ b/pandas/tests/extension/test_string.py
@@ -18,6 +18,8 @@
 import numpy as np
 import pytest
 
+from pandas.compat import pa_version_under2p0
+
 import pandas as pd
 from pandas.core.arrays import ArrowStringArray
 from pandas.core.arrays.string_ import StringDtype
@@ -186,7 +188,12 @@ class TestPrinting(base.BasePrintingTests):
 
 
 class TestGroupBy(base.BaseGroupbyTests):
-    pass
+    def test_groupby_extension_transform(self, data_for_grouping, request):
+        if data_for_grouping.dtype.storage == "pyarrow" and pa_version_under2p0:
+            # failure observed in 1.0.1, not in 2.0 or later
+            mark = pytest.mark.xfail(reason="pyarrow raises in self._data[item]")
+            request.node.add_marker(mark)
+        super().test_groupby_extension_transform(data_for_grouping)
 
 
 class Test2DCompat(base.Dim2CompatTests):

From 3d9b9af269c1ac24ebd2d7128ba2371c97a11fda Mon Sep 17 00:00:00 2001
From: Brock <jbrockmendel@gmail.com>
Date: Mon, 1 Nov 2021 16:52:07 -0700
Subject: [PATCH 25/57] mypy fixup

---
 pandas/_libs/index.pyi          | 42 +++++++++++++++++++++++++++++++--
 pandas/_testing/asserters.py    | 12 +++++-----
 pandas/core/indexes/base.py     | 21 ++++++++++-------
 pandas/core/indexes/interval.py |  4 +++-
 pandas/core/indexes/multi.py    |  7 ++++++
 pandas/tests/base/test_misc.py  | 10 +++++---
 6 files changed, 75 insertions(+), 21 deletions(-)

diff --git a/pandas/_libs/index.pyi b/pandas/_libs/index.pyi
index fd521fc446690..21f1c3e147a59 100644
--- a/pandas/_libs/index.pyi
+++ b/pandas/_libs/index.pyi
@@ -1,9 +1,17 @@
+from typing import TYPE_CHECKING
+
 import numpy as np
 
 from pandas._typing import npt
 
 from pandas import MultiIndex
 
+if TYPE_CHECKING:
+    from pandas.core.arrays import (
+        BaseMaskedArray,
+        ExtensionArray,
+    )
+
 class IndexEngine:
     over_size_threshold: bool
     def __init__(self, values: np.ndarray): ...
@@ -65,7 +73,37 @@ class BaseMultiIndexCodesEngine:
     ) -> npt.NDArray[np.intp]: ...
 
 class NullableEngine:
-    pass
+    def __init__(self, values: "BaseMaskedArray"): ...
+    def __contains__(self, val: object) -> bool: ...
+    def get_loc(self, val: object) -> int | slice | np.ndarray: ...
+    def get_indexer(self, values: np.ndarray) -> npt.NDArray[np.intp]: ...
+    def get_indexer_non_unique(
+        self,
+        targets: np.ndarray,
+    ) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]: ...
+    @property
+    def is_unique(self) -> bool: ...
+    @property
+    def is_monotonic_increasing(self) -> bool: ...
+    @property
+    def is_monotonic_decreasing(self) -> bool: ...
+    def sizeof(self, deep: bool = ...) -> int: ...
+    def clear_mapping(self): ...
 
 class ExtensionEngine:
-    pass
+    def __init__(self, values: "ExtensionArray"): ...
+    def __contains__(self, val: object) -> bool: ...
+    def get_loc(self, val: object) -> int | slice | np.ndarray: ...
+    def get_indexer(self, values: np.ndarray) -> npt.NDArray[np.intp]: ...
+    def get_indexer_non_unique(
+        self,
+        targets: np.ndarray,
+    ) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]: ...
+    @property
+    def is_unique(self) -> bool: ...
+    @property
+    def is_monotonic_increasing(self) -> bool: ...
+    @property
+    def is_monotonic_decreasing(self) -> bool: ...
+    def sizeof(self, deep: bool = ...) -> int: ...
+    def clear_mapping(self): ...
diff --git a/pandas/_testing/asserters.py b/pandas/_testing/asserters.py
index de5057f56fae0..696b237935dd2 100644
--- a/pandas/_testing/asserters.py
+++ b/pandas/_testing/asserters.py
@@ -42,6 +42,7 @@
     take_nd,
 )
 from pandas.core.arrays import (
+    BaseMaskedArray,
     DatetimeArray,
     ExtensionArray,
     IntervalArray,
@@ -403,15 +404,14 @@ def _get_ilevel_values(index, level):
         if not left.equals(right):
             mismatch = left._values != right._values
 
-            if not isinstance(mismatch, np.ndarray):
-                # i.e. its a MaskedArray
+            if isinstance(mismatch, BaseMaskedArray):
+                lvalues = cast(BaseMaskedArray, left._values)
+                rvalues = cast(BaseMaskedArray, right._values)
                 mismatch = mismatch.to_numpy(dtype=int, na_value=0)
-                mismask = left._values._mask ^ right._values._mask
+                mismask = lvalues._mask ^ rvalues._mask
                 mismatch[mismask] = 1
 
-            diff = (
-                np.sum(mismatch.astype(int)) * 100.0 / len(left)
-            )
+            diff = np.sum(mismatch.astype(int)) * 100.0 / len(left)
             msg = f"{obj} values are different ({np.round(diff, 5)} %)"
             raise_assert_detail(obj, msg, left, right)
     else:
diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py
index 430965491bf61..ea4a3d222581c 100644
--- a/pandas/core/indexes/base.py
+++ b/pandas/core/indexes/base.py
@@ -360,7 +360,7 @@ def _outer_indexer(
 
     _typ: str = "index"
     _data: ExtensionArray | np.ndarray
-    _data_cls: tuple[type[np.ndarray], type[ExtensionArray]] = (
+    _data_cls: type[ExtensionArray] | tuple[type[np.ndarray], type[ExtensionArray]] = (
         np.ndarray,
         ExtensionArray,
     )
@@ -381,7 +381,9 @@ def _outer_indexer(
     # associated code in pandas 2.0.
     _is_backward_compat_public_numeric_index: bool = False
 
-    _engine_type: type[libindex.IndexEngine] = libindex.ObjectEngine
+    _engine_type: type[libindex.IndexEngine] | type[libindex.NullableEngine] | type[
+        libindex.ExtensionEngine
+    ] = libindex.ObjectEngine
     # whether we support partial string indexing. Overridden
     # in DatetimeIndex and PeriodIndex
     _supports_partial_string_indexing = False
@@ -833,7 +835,9 @@ def _cleanup(self) -> None:
         self._engine.clear_mapping()
 
     @cache_readonly
-    def _engine(self) -> libindex.IndexEngine:
+    def _engine(
+        self,
+    ) -> libindex.IndexEngine | libindex.NullableEngine | libindex.ExtensionEngine:
         # For base class (object dtype) we get ObjectEngine
 
         if isinstance(self._values, BaseMaskedArray):
@@ -844,10 +848,11 @@ def _engine(self) -> libindex.IndexEngine:
         ):
             return libindex.ExtensionEngine(self._values)
 
+        engine_type = cast(type[libindex.IndexEngine], self._engine_type)
         # to avoid a reference cycle, bind `target_values` to a local variable, so
         # `self` is not passed into the lambda.
         target_values = self._get_engine_target()
-        return self._engine_type(target_values)
+        return engine_type(target_values)
 
     @final
     @cache_readonly
@@ -3711,7 +3716,7 @@ def _get_indexer(
             if target._is_multi and self._is_multi:
                 engine = self._engine
                 # error: "IndexEngine" has no attribute "_extract_level_codes"
-                tgt_values = engine._extract_level_codes(  # type: ignore[attr-defined]
+                tgt_values = engine._extract_level_codes(  # type: ignore[union-attr]
                     target
                 )
 
@@ -3791,7 +3796,7 @@ def _get_fill_indexer(
             # TODO: get_indexer_with_fill docstring says values must be _sorted_
             #  but that doesn't appear to be enforced
             # error: "IndexEngine" has no attribute "get_indexer_with_fill"
-            return self._engine.get_indexer_with_fill(  # type: ignore[attr-defined]
+            return self._engine.get_indexer_with_fill(  # type: ignore[union-attr]
                 target=target._values, values=self._values, method=method, limit=limit
             )
 
@@ -5631,9 +5636,7 @@ def get_indexer_non_unique(
         if self._is_multi and target._is_multi:
             engine = self._engine
             # error: "IndexEngine" has no attribute "_extract_level_codes"
-            tgt_values = engine._extract_level_codes(  # type: ignore[attr-defined]
-                target
-            )
+            tgt_values = engine._extract_level_codes(target)  # type: ignore[union-attr]
 
         indexer, missing = self._engine.get_indexer_non_unique(tgt_values)
         return ensure_platform_int(indexer), ensure_platform_int(missing)
diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py
index ed8072f7b0dd5..7c61b31b8ef66 100644
--- a/pandas/core/indexes/interval.py
+++ b/pandas/core/indexes/interval.py
@@ -731,7 +731,9 @@ def _get_indexer_pointwise(
                     locs = np.array(locs, ndmin=1)
                 else:
                     # FIXME: This is wrong; its boolean; not reached
-                    assert locs.dtype.kind == "i"
+                    # error: Item "int" of "Union[int, ndarray[Any, Any]]"
+                    #  has no attribute "dtype"
+                    assert locs.dtype.kind == "i"  # type: ignore[union-attr]
 
             except KeyError:
                 missing.append(i)
diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py
index fe97d61be7548..febd0ce1279e8 100644
--- a/pandas/core/indexes/multi.py
+++ b/pandas/core/indexes/multi.py
@@ -283,6 +283,13 @@ class MultiIndex(Index):
     of the mentioned helper methods.
     """
 
+    # error: Incompatible types in assignment (expression has type
+    # "Type[BaseMultiIndexCodesEngine]", base class "Index" defined the type as
+    # "Union[Type[IndexEngine], Type[NullableEngine], Type[ExtensionEngine]]")
+    _engine_type: type[
+        libindex.BaseMultiIndexCodesEngine
+    ] = libindex.BaseMultiIndexCodesEngine  # type: ignore[assignment]
+
     _hidden_attrs = Index._hidden_attrs | frozenset()
 
     # initialize to zero-length tuples to make everything work
diff --git a/pandas/tests/base/test_misc.py b/pandas/tests/base/test_misc.py
index dbf76149ed3cd..d4708cb4dcfcc 100644
--- a/pandas/tests/base/test_misc.py
+++ b/pandas/tests/base/test_misc.py
@@ -6,7 +6,6 @@
 from pandas.compat import (
     IS64,
     PYPY,
-    pa_version_under1p0,
 )
 
 from pandas.core.dtypes.common import (
@@ -157,8 +156,13 @@ def test_access_by_position(index):
     assert index[-1] == index[size - 1]
 
     msg = f"index {size} is out of bounds for axis 0 with size {size}"
-    if pa_version_under1p0 and is_dtype_equal(index.dtype, "string[pyarrow]"):
-        # TODO(GH#44276) pa_version_under1p0 check should be unnecessary
+    try:
+        eq = is_dtype_equal(index.dtype, "string[pyarrow]")
+    except ImportError:
+        # TODO(GH#44276) is_dtype_equal can raise here
+        eq = False
+
+    if eq:
         msg = "index out of bounds"
     with pytest.raises(IndexError, match=msg):
         index[size]

From 2d75377c5812d3d62fab6c395bdc171272d23cd9 Mon Sep 17 00:00:00 2001
From: Brock <jbrockmendel@gmail.com>
Date: Tue, 2 Nov 2021 10:02:37 -0700
Subject: [PATCH 26/57] lint fixups

---
 pandas/_libs/lib.pxd                   | 1 +
 pandas/core/indexes/base.py            | 2 +-
 pandas/core/indexing.py                | 4 +++-
 pandas/tests/base/test_value_counts.py | 4 ++--
 4 files changed, 7 insertions(+), 4 deletions(-)

diff --git a/pandas/_libs/lib.pxd b/pandas/_libs/lib.pxd
index 1306960b403e2..46a339f2e7cbb 100644
--- a/pandas/_libs/lib.pxd
+++ b/pandas/_libs/lib.pxd
@@ -1,5 +1,6 @@
 from numpy cimport ndarray
 
+
 cdef bint c_is_list_like(object, bint) except -1
 
 cpdef ndarray eq_NA_compat(ndarray[object] arr, object key)
diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py
index ea4a3d222581c..9f2ca205d9efe 100644
--- a/pandas/core/indexes/base.py
+++ b/pandas/core/indexes/base.py
@@ -848,7 +848,7 @@ def _engine(
         ):
             return libindex.ExtensionEngine(self._values)
 
-        engine_type = cast(type[libindex.IndexEngine], self._engine_type)
+        engine_type = cast("type[libindex.IndexEngine]", self._engine_type)
         # to avoid a reference cycle, bind `target_values` to a local variable, so
         # `self` is not passed into the lambda.
         target_values = self._get_engine_target()
diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py
index a78cc6bbb8bee..68e914a186381 100644
--- a/pandas/core/indexing.py
+++ b/pandas/core/indexing.py
@@ -993,7 +993,9 @@ def _validate_key(self, key, axis: int):
         # slice of labels (where start-end in labels)
         # slice of integers (only if in the labels)
         # boolean not in slice and with boolean index
-        if isinstance(key, bool) and not (is_bool_dtype(self.obj.index) or self.obj.index.dtype.name == "boolean"):
+        if isinstance(key, bool) and not (
+            is_bool_dtype(self.obj.index) or self.obj.index.dtype.name == "boolean"
+        ):
             raise KeyError(
                 f"{key}: boolean label can not be used without a boolean index"
             )
diff --git a/pandas/tests/base/test_value_counts.py b/pandas/tests/base/test_value_counts.py
index 3566ad5df226c..a5f143590ba03 100644
--- a/pandas/tests/base/test_value_counts.py
+++ b/pandas/tests/base/test_value_counts.py
@@ -85,8 +85,8 @@ def test_value_counts_null(null_obj, index_or_series_obj):
 
     # can't use expected[null_obj] = 3 as
     # IntervalIndex doesn't allow assignment
-    #new_entry = Series({np.nan: 3}, dtype=np.int64)
-    #expected = expected.append(new_entry)  # TODO: test that both of these work with IntegerNAIndex
+    # TODO: test that both expected.append(Series({np.nan: 3}, dtype=np.int64))
+    #  and expected[null_obj] = 3 work with IntegerNAIndex
     expected[null_obj] = 3
 
     result = obj.value_counts(dropna=False)

From 37b9370334437354ad2d99c0fe2cc489c30410b6 Mon Sep 17 00:00:00 2001
From: Brock <jbrockmendel@gmail.com>
Date: Tue, 2 Nov 2021 20:43:16 -0700
Subject: [PATCH 27/57] avoid warnings

---
 pandas/_libs/index.pyx      |  3 +--
 pandas/core/indexes/base.py | 10 ++++++++--
 2 files changed, 9 insertions(+), 4 deletions(-)

diff --git a/pandas/_libs/index.pyx b/pandas/_libs/index.pyx
index 7499011e1060e..c289e21dba87e 100644
--- a/pandas/_libs/index.pyx
+++ b/pandas/_libs/index.pyx
@@ -1199,8 +1199,7 @@ cdef class NullableEngine:
 
     cdef _get_bool_indexer(self, val):
         if val is NA:
-            #if not self.has_missing:
-            #    raise KeyError(val)
+            # TODO: KeyError(val) if not has_missing?
             # TODO: readonly? copy?
             return self._mask.view("uint8")
 
diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py
index 9f2ca205d9efe..a8b3aaf27fec9 100644
--- a/pandas/core/indexes/base.py
+++ b/pandas/core/indexes/base.py
@@ -3466,7 +3466,7 @@ def symmetric_difference(self, other, result_name=None, sort=None):
         res_values = concat_compat([left_diff, right_diff])
         res_values = _maybe_try_sort(res_values, sort)
 
-        result = Index(res_values, name=result_name)
+        result = Index(res_values, name=result_name, dtype=res_values.dtype)
 
         if self._is_multi:
             self = cast("MultiIndex", self)
@@ -3490,7 +3490,13 @@ def _assert_can_do_setop(self, other) -> bool:
 
     def _convert_can_do_setop(self, other) -> tuple[Index, Hashable]:
         if not isinstance(other, Index):
-            other = Index(other, name=self.name)
+            # TODO(2.0): no need to special-case here once _with_infer
+            #  deprecation is enforced
+            if hasattr(other, "dtype"):
+                other = Index(other, name=self.name, dtype=other.dtype)
+            else:
+                # e.g. list
+                other = Index(other, name=self.name)
             result_name = self.name
         else:
             result_name = get_op_result_name(self, other)

From 0e56218fa88979723aaa32eed9e3749ae727bf12 Mon Sep 17 00:00:00 2001
From: Brock <jbrockmendel@gmail.com>
Date: Thu, 4 Nov 2021 18:02:56 -0700
Subject: [PATCH 28/57] avoid FutureWarnings

---
 pandas/core/indexes/category.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py
index 005c5f75e6cfa..e037a7e0ed75f 100644
--- a/pandas/core/indexes/category.py
+++ b/pandas/core/indexes/category.py
@@ -462,7 +462,7 @@ def reindex(
         else:
             # e.g. test_reindex_with_categoricalindex, test_reindex_duplicate_target
             new_target = np.asarray(new_target)
-            new_target = Index(new_target, name=self.name)
+            new_target = Index._with_infer(new_target, name=self.name)
 
         return new_target, indexer
 

From e8987cd04e41e3fb8d748bda72278f4d8f6c0a53 Mon Sep 17 00:00:00 2001
From: Brock <jbrockmendel@gmail.com>
Date: Thu, 4 Nov 2021 21:40:25 -0700
Subject: [PATCH 29/57] catch RuntimeWarning

---
 pandas/core/indexes/base.py         | 1 +
 pandas/tests/indexes/test_setops.py | 9 ++++++++-
 2 files changed, 9 insertions(+), 1 deletion(-)

diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py
index a8b3aaf27fec9..69e4bd6ae06b2 100644
--- a/pandas/core/indexes/base.py
+++ b/pandas/core/indexes/base.py
@@ -5845,6 +5845,7 @@ def _maybe_promote(self, other: Index) -> tuple[Index, Index]:
             # TODO: we dont have tests that get here
             return type(other)(self), other
         elif self.inferred_type == "boolean":
+            return self, other
             if not is_object_dtype(self.dtype):
                 return self.astype("object"), other.astype("object")
 
diff --git a/pandas/tests/indexes/test_setops.py b/pandas/tests/indexes/test_setops.py
index bec0a41b9bb6b..5e2c2a55fa713 100644
--- a/pandas/tests/indexes/test_setops.py
+++ b/pandas/tests/indexes/test_setops.py
@@ -208,7 +208,14 @@ def test_union_base(self, index):
         first = index[3:]
         second = index[:5]
         everything = index
-        union = first.union(second)
+
+        warn = None
+        if is_dtype_equal(index.dtype, "boolean") and index.isna().any():
+            warn = RuntimeWarning
+
+        msg = "boolean value of NA is ambiguous"
+        with tm.assert_produces_warning(warn, match=msg):
+            union = first.union(second)
         assert tm.equalContents(union, everything)
 
         if is_datetime64tz_dtype(index.dtype):

From fef88a7e252fc594b8a7ef4afd7ac04d9ebc240d Mon Sep 17 00:00:00 2001
From: Brock <jbrockmendel@gmail.com>
Date: Thu, 4 Nov 2021 21:43:04 -0700
Subject: [PATCH 30/57] remove unreachable

---
 pandas/core/indexes/base.py | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py
index 69e4bd6ae06b2..7538017c29663 100644
--- a/pandas/core/indexes/base.py
+++ b/pandas/core/indexes/base.py
@@ -5844,10 +5844,6 @@ def _maybe_promote(self, other: Index) -> tuple[Index, Index]:
         elif self.inferred_type == "timedelta" and isinstance(other, ABCTimedeltaIndex):
             # TODO: we dont have tests that get here
             return type(other)(self), other
-        elif self.inferred_type == "boolean":
-            return self, other
-            if not is_object_dtype(self.dtype):
-                return self.astype("object"), other.astype("object")
 
         elif self.dtype.kind == "u" and other.dtype.kind == "i":
             # GH#41873

From 63f26ba3d8825e85531cbea6add1ea37023627c9 Mon Sep 17 00:00:00 2001
From: Brock <jbrockmendel@gmail.com>
Date: Fri, 19 Nov 2021 13:14:51 -0800
Subject: [PATCH 31/57] revert no-longer-necessary

---
 pandas/core/arrays/floating.py     | 3 ---
 pandas/core/arrays/sparse/array.py | 4 ++--
 2 files changed, 2 insertions(+), 5 deletions(-)

diff --git a/pandas/core/arrays/floating.py b/pandas/core/arrays/floating.py
index 1dee05954d8d5..e1eb5110ef5fe 100644
--- a/pandas/core/arrays/floating.py
+++ b/pandas/core/arrays/floating.py
@@ -429,9 +429,6 @@ def _maybe_mask_result(self, result, mask, other, op_name: str):
 
         return type(self)(result, mask, copy=False)
 
-    def isna(self):
-        return self._mask | np.isnan(self._data)
-
 
 _dtype_docstring = """
 An ExtensionDtype for {dtype} data.
diff --git a/pandas/core/arrays/sparse/array.py b/pandas/core/arrays/sparse/array.py
index 9eeeb93f673f2..c054710a01f75 100644
--- a/pandas/core/arrays/sparse/array.py
+++ b/pandas/core/arrays/sparse/array.py
@@ -954,10 +954,10 @@ def __getitem__(
                 # mypy doesn't know we have an array here
                 key = cast(np.ndarray, key)
                 return self.take(np.arange(len(key), dtype=np.int32)[key])
-            elif lib.is_list_like(key):
+            elif hasattr(key, "__len__"):
                 return self.take(key)
             else:
-                raise IndexError(f"Cannot slice with '{key}'")
+                raise ValueError(f"Cannot slice with '{key}'")
 
         return type(self)(data_slice, kind=self.kind)
 

From 11d35642c0333e69809bdba4175ab053bb0c8386 Mon Sep 17 00:00:00 2001
From: Brock <jbrockmendel@gmail.com>
Date: Fri, 19 Nov 2021 18:50:11 -0800
Subject: [PATCH 32/57] Share ExtensionEngine/NullableEngine methods

---
 pandas/_libs/index.pyx             | 358 ++++++++---------------------
 pandas/core/arrays/string_arrow.py |   6 +-
 pandas/core/indexes/base.py        |  11 +-
 3 files changed, 105 insertions(+), 270 deletions(-)

diff --git a/pandas/_libs/index.pyx b/pandas/_libs/index.pyx
index c289e21dba87e..e6c0914e9e233 100644
--- a/pandas/_libs/index.pyx
+++ b/pandas/_libs/index.pyx
@@ -794,8 +794,9 @@ cdef class BaseMultiIndexCodesEngine:
 include "index_class_helper.pxi"
 
 
+@cython.internal
 @cython.freelist(32)
-cdef class ExtensionEngine:
+cdef class SharedEngine:
     cdef readonly:
         object values  # ExtensionArray
         bint over_size_threshold
@@ -804,13 +805,14 @@ cdef class ExtensionEngine:
         bint unique, monotonic_inc, monotonic_dec
         bint need_monotonic_check, need_unique_check
 
-    def __init__(self, values: "ExtensionArray"):
-        self.values = values
-
-        self.over_size_threshold = len(values) >= _SIZE_CUTOFF
-        self.need_unique_check = True
-        self.need_monotonic_check = True
-        self.need_unique_check = True
+    def __contains__(self, val: object) -> bool:
+        # We assume before we get here:
+        #  - val is hashable
+        try:
+            self.get_loc(val)
+            return True
+        except KeyError:
+            return False
 
     def clear_mapping(self):
         # for compat with IndexEngine
@@ -825,6 +827,9 @@ cdef class ExtensionEngine:
             self.need_unique_check = False
         return self.unique
 
+    cdef _do_monotonic_check(self):
+        raise NotImplementedError
+
     @property
     def is_monotonic_increasing(self) -> bool:
         if self.need_monotonic_check:
@@ -839,37 +844,19 @@ cdef class ExtensionEngine:
 
         return self.monotonic_dec == 1
 
-    cdef inline _do_monotonic_check(self):
-        cdef:
-            bint is_unique
-
-        # FIXME: shouldn't depend on non-required _values_for_argsort
-        try:
-            self.monotonic_inc, self.monotonic_dec, is_unique = \
-                self._call_monotonic(self.values._values_for_argsort())
-        except TypeError:
-            self.monotonic_inc = 0
-            self.monotonic_dec = 0
-            is_unique = 0
+    cdef _call_monotonic(self, values):
+        return algos.is_monotonic(values, timelike=False)
 
-        self.need_monotonic_check = 0
+    def sizeof(self, deep: bool = False) -> int:
+        """ return the sizeof our mapping """
+        return 0
 
-        # we can only be sure of uniqueness if is_unique=1
-        if is_unique:
-            self.unique = 1
-            self.need_unique_check = 0
+    def __sizeof__(self) -> int:
+        return self.sizeof()
 
-    cdef _call_monotonic(self, values):
-        return algos.is_monotonic(values, timelike=False)
+    cdef _check_type(self, object obj):
+        raise NotImplementedError
 
-    def __contains__(self, val: object) -> bool:
-        # We assume before we get here:
-        #  - val is hashable
-        try:
-            self.get_loc(val)
-            return True
-        except KeyError:
-            return False
 
     cpdef get_loc(self, object val):
         # -> Py_ssize_t | slice | ndarray[bool]
@@ -899,17 +886,6 @@ cdef class ExtensionEngine:
 
         return self._get_loc_duplicates(val)
 
-    cdef Py_ssize_t _searchsorted_left(self, val) except? -1:
-        """
-        See ObjectEngine._searchsorted_left.__doc__.
-        """
-        try:
-            loc = self.values.searchsorted(val, side="left")
-        except TypeError as err:
-            # GH#35788 e.g. val=None with float64 values
-            raise KeyError(val)
-        return loc
-
     cdef inline _get_loc_duplicates(self, object val):
         # -> Py_ssize_t | slice | ndarray[bool]
         cdef:
@@ -934,22 +910,19 @@ cdef class ExtensionEngine:
 
         return self._maybe_get_bool_indexer(val)
 
-    cdef ndarray _get_bool_indexer(self, val):
-        if checknull(val):
-            return self.values.isna().view("uint8")
-
+    cdef Py_ssize_t _searchsorted_left(self, val) except? -1:
+        """
+        See ObjectEngine._searchsorted_left.__doc__.
+        """
         try:
-            return self.values == val
-        except TypeError:
-            # e.g. if __eq__ returns a BooleanArray instead of ndarry[bool]
-            try:
-                return (self.values == val).to_numpy(dtype=bool, na_value=False)
-            except (TypeError, AttributeError) as err:
-                # e.g. (self.values == val) returned a bool
-                #  see test_get_loc_generator[string[pyarrow]]
-                # e.g. self.value == val raises TypeError bc generator has no len
-                #  see test_get_loc_generator[string[python]]
-                raise KeyError from err
+            loc = self.values.searchsorted(val, side="left")
+        except TypeError as err:
+            # GH#35788 e.g. val=None with float64 values
+            raise KeyError(val)
+        return loc
+
+    cdef ndarray _get_bool_indexer(self, val):
+        raise NotImplementedError
 
     cdef _maybe_get_bool_indexer(self, object val):
         # Returns ndarray[bool] or int
@@ -959,17 +932,8 @@ cdef class ExtensionEngine:
         indexer = self._get_bool_indexer(val)
         return _unpack_bool_indexer(indexer, val)
 
-    def sizeof(self, deep: bool = False) -> int:
-        """ return the sizeof our mapping """
-        return 0
-
-    def __sizeof__(self) -> int:
-        return self.sizeof()
-
-    cdef _check_type(self, object val):
-        hash(val)
-
-    def get_indexer(self, values: "ExtensionArray") -> np.ndarray:
+    def get_indexer(self, values) -> np.ndarray:
+        # values : type(self.values)
         # Note: we only get here with self.is_unique
         cdef:
             Py_ssize_t i, N = len(values)
@@ -989,13 +953,17 @@ cdef class ExtensionEngine:
 
         return res
 
-    def get_indexer_non_unique(self, targets: "ExtensionArray"):
+    def get_indexer_non_unique(self, targets):
         """
         Return an indexer suitable for taking from a non unique index
         return the labels in the same order as the target
         and a missing indexer into the targets (which correspond
         to the -1 indices in the results
 
+        Parameters
+        ----------
+        targets : type(self.values)
+
         Returns
         -------
         indexer : np.ndarray[np.intp]
@@ -1038,18 +1006,61 @@ cdef class ExtensionEngine:
         return indexer, missing
 
 
-@cython.freelist(32)
-cdef class NullableEngine:
+cdef class ExtensionEngine(SharedEngine):
+    def __init__(self, values: "ExtensionArray"):
+        self.values = values
+
+        self.over_size_threshold = len(values) >= _SIZE_CUTOFF
+        self.need_unique_check = True
+        self.need_monotonic_check = True
+        self.need_unique_check = True
+
+    cdef _do_monotonic_check(self):
+        cdef:
+            bint is_unique
+
+        # FIXME: shouldn't depend on non-required _values_for_argsort
+        try:
+            self.monotonic_inc, self.monotonic_dec, is_unique = \
+                self._call_monotonic(self.values._values_for_argsort())
+        except TypeError:
+            self.monotonic_inc = 0
+            self.monotonic_dec = 0
+            is_unique = 0
+
+        self.need_monotonic_check = 0
+
+        # we can only be sure of uniqueness if is_unique=1
+        if is_unique:
+            self.unique = 1
+            self.need_unique_check = 0
+
+    cdef ndarray _get_bool_indexer(self, val):
+        if checknull(val):
+            return self.values.isna().view("uint8")
+
+        try:
+            return self.values == val
+        except TypeError:
+            # e.g. if __eq__ returns a BooleanArray instead of ndarray[bool]
+            try:
+                return (self.values == val).to_numpy(dtype=bool, na_value=False)
+            except (TypeError, AttributeError) as err:
+                # e.g. (self.values == val) returned a bool
+                #  see test_get_loc_generator[string[pyarrow]]
+                # e.g. self.values == val raises TypeError bc generator has no len
+                #  see test_get_loc_generator[string[python]]
+                raise KeyError from err
+
+    cdef _check_type(self, object val):
+        hash(val)
+
+
+cdef class NullableEngine(SharedEngine):
 
     cdef readonly:
         ndarray _values, _mask
-        bint over_size_threshold
         bint has_missing
-        object values  # MaskedArray
-
-    cdef:
-        bint unique, monotonic_inc, monotonic_dec
-        bint need_monotonic_check, need_unique_check
 
     def __init__(self, values: "MaskedArray"):
         self.values = values
@@ -1062,34 +1073,7 @@ cdef class NullableEngine:
         self.over_size_threshold = len(values) >= _SIZE_CUTOFF
         self.need_unique_check = True
 
-    def clear_mapping(self):
-        # for compat with IndexEngine
-        pass
-
-    @property
-    def is_unique(self) -> bool:
-        if self.need_unique_check:
-            arr = self.values.unique()
-            self.unique = len(arr) == len(self.values)
-
-            self.need_unique_check = False
-        return self.unique
-
-    @property
-    def is_monotonic_increasing(self) -> bool:
-        if self.need_monotonic_check:
-            self._do_monotonic_check()
-
-        return self.monotonic_inc == 1
-
-    @property
-    def is_monotonic_decreasing(self) -> bool:
-        if self.need_monotonic_check:
-            self._do_monotonic_check()
-
-        return self.monotonic_dec == 1
-
-    cdef inline _do_monotonic_check(self):
+    cdef _do_monotonic_check(self):
         cdef:
             bint is_unique
 
@@ -1115,26 +1099,11 @@ cdef class NullableEngine:
             self.unique = 1
             self.need_unique_check = 0
 
-    cdef _call_monotonic(self, values):
-        return algos.is_monotonic(values, timelike=False)
-
-    def __contains__(self, val: object) -> bool:
-        # We assume before we get here:
-        #  - val is hashable
-        try:
-            self.get_loc(val)
-            return True
-        except KeyError:
-            return False
-
     cpdef get_loc(self, object val):
         # -> Py_ssize_t | slice | ndarray[bool]
         cdef:
             Py_ssize_t loc
 
-        if is_definitely_invalid_key(val):
-            raise TypeError(f"'{val}' is an invalid key")
-
         if val is NA:
             # TODO: return copy? readonly view?
             # TODO: do this later on to keep same pattern as IndexEngine?
@@ -1142,62 +1111,9 @@ cdef class NullableEngine:
                 raise KeyError(val)
             return _unpack_bool_indexer(self._mask, val)
 
-        self._check_type(val)
-
-        if self.over_size_threshold and self.is_monotonic_increasing:
-            if not self.is_unique:
-                return self._get_loc_duplicates(val)
-
-            values = self.values
-
-            loc = self._searchsorted_left(val)
-            if loc >= len(values):
-                raise KeyError(val)
-            if values[loc] != val:
-                raise KeyError(val)
-            return loc
-
-        if not self.unique:
-            return self._get_loc_duplicates(val)
-
-        return self._get_loc_duplicates(val)
-
-    cdef Py_ssize_t _searchsorted_left(self, val) except? -1:
-        """
-        See ObjectEngine._searchsorted_left.__doc__.
-        """
-        try:
-            loc = self.values.searchsorted(val, side="left")
-        except TypeError as err:
-            # GH#35788 e.g. val=None with float64 values
-            raise KeyError(val)
-        return loc
-
-    cdef inline _get_loc_duplicates(self, object val):
-        # -> Py_ssize_t | slice | ndarray[bool]
-        cdef:
-            Py_ssize_t diff
-
-        if self.is_monotonic_increasing:
-            values = self.values
-            try:
-                left = values.searchsorted(val, side='left')
-                right = values.searchsorted(val, side='right')
-            except TypeError:
-                # e.g. GH#29189 get_loc(None) with a Float64Index
-                raise KeyError(val)
-
-            diff = right - left
-            if diff == 0:
-                raise KeyError(val)
-            elif diff == 1:
-                return left
-            else:
-                return slice(left, right)
-
-        return self._maybe_get_bool_indexer(val)
+        return SharedEngine.get_loc(self, val)
 
-    cdef _get_bool_indexer(self, val):
+    cdef ndarray _get_bool_indexer(self, val):
         if val is NA:
             # TODO: KeyError(val) if not has_missing?
             # TODO: readonly? copy?
@@ -1211,21 +1127,6 @@ cdef class NullableEngine:
         res[self._mask] = False
         return res
 
-    cdef _maybe_get_bool_indexer(self, object val):
-        # Returns ndarray[bool] or int
-        cdef:
-            ndarray[uint8_t, ndim=1, cast=True] indexer
-
-        indexer = self._get_bool_indexer(val)
-        return _unpack_bool_indexer(indexer, val)
-
-    def sizeof(self, deep: bool = False) -> int:
-        """ return the sizeof our mapping """
-        return 0
-
-    def __sizeof__(self) -> int:
-        return self.sizeof()
-
     cdef _check_type(self, object val):
         kind = self._values.dtype.kind
         if kind in ["i", "u"]:
@@ -1242,72 +1143,3 @@ cdef class NullableEngine:
             if not util.is_integer_object(val) and not util.is_float_object(val):
                 # in particular catch bool and avoid casting True -> 1.0
                 raise KeyError(val)
-
-    def get_indexer(self, values: "MaskedArray") -> np.ndarray:
-        # Note: we only get here with self.is_unique
-        cdef:
-            Py_ssize_t i, N = len(values)
-
-        res = np.empty(N, dtype=np.intp)
-
-        for i in range(N):
-            val = values[i]
-            try:
-                loc = self.get_loc(val)
-                # Because we are unique, loc should always be an integer
-            except KeyError:
-                loc = -1
-            else:
-                assert util.is_integer_object(loc), (loc, val)
-
-            res[i] = loc
-
-        return res
-
-    def get_indexer_non_unique(self, targets: "MaskedArray"):
-        """
-        Return an indexer suitable for taking from a non unique index
-        return the labels in the same order as the target
-        and a missing indexer into the targets (which correspond
-        to the -1 indices in the results
-
-        Returns
-        -------
-        indexer : np.ndarray[np.intp]
-        missing : np.ndarray[np.intp]
-        """
-        cdef:
-            Py_ssize_t i, N = len(targets)
-
-        indexer = []
-        missing = []
-
-        # See also IntervalIndex.get_indexer_pointwise
-        for i in range(N):
-            val = targets[i]
-
-            try:
-                locs = self.get_loc(val)
-            except KeyError:
-                locs = np.array([-1], dtype=np.intp)
-                missing.append(i)
-            else:
-                if isinstance(locs, slice):
-                    # Only needed for get_indexer_non_unique
-                    locs = np.arange(locs.start, locs.stop, locs.step, dtype=np.intp)
-                elif util.is_integer_object(locs):
-                    locs = np.array([locs], dtype=np.intp)
-                else:
-                    assert locs.dtype.kind == "b"
-                    locs = locs.nonzero()[0]
-
-            indexer.append(locs)
-
-        try:
-            indexer = np.concatenate(indexer, dtype=np.intp)
-        except TypeError:
-            # numpy<1.20 doesn't accept dtype keyword
-            indexer = np.concatenate(indexer).astype(np.intp, copy=False)
-        missing = np.array(missing, dtype=np.intp)
-
-        return indexer, missing
diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py
index 20c34e811fd2a..272e51d2c1120 100644
--- a/pandas/core/arrays/string_arrow.py
+++ b/pandas/core/arrays/string_arrow.py
@@ -312,7 +312,11 @@ def __getitem__(
                 )
         elif isinstance(item, tuple):
             item = unpack_tuple_and_ellipses(item)
-        if item is Ellipsis:
+
+        # error: Non-overlapping identity check (left operand type:
+        # "Union[Union[int, integer[Any]], Union[slice, List[int],
+        # ndarray[Any, Any]]]", right operand type: "ellipsis")
+        if item is Ellipsis:  # type: ignore[comparison-overlap]
             # TODO: should be handled by pyarrow?
             item = slice(None)
 
diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py
index a24eeed04e56c..3a802691e4c76 100644
--- a/pandas/core/indexes/base.py
+++ b/pandas/core/indexes/base.py
@@ -381,9 +381,7 @@ def _outer_indexer(
     # associated code in pandas 2.0.
     _is_backward_compat_public_numeric_index: bool = False
 
-    _engine_type: type[libindex.IndexEngine] | type[libindex.NullableEngine] | type[
-        libindex.ExtensionEngine
-    ] = libindex.ObjectEngine
+    _engine_type: type[libindex.IndexEngine] = libindex.ObjectEngine
     # whether we support partial string indexing. Overridden
     # in DatetimeIndex and PeriodIndex
     _supports_partial_string_indexing = False
@@ -666,7 +664,6 @@ def _with_infer(cls, *args, **kwargs):
         Constructor that uses the 1.0.x behavior inferring numeric dtypes
         for ndarray[object] inputs.
         """
-
         with warnings.catch_warnings():
             warnings.filterwarnings("ignore", ".*the Index constructor", FutureWarning)
             result = cls(*args, **kwargs)
@@ -848,11 +845,10 @@ def _engine(
         ):
             return libindex.ExtensionEngine(self._values)
 
-        engine_type = cast("type[libindex.IndexEngine]", self._engine_type)
         # to avoid a reference cycle, bind `target_values` to a local variable, so
         # `self` is not passed into the lambda.
         target_values = self._get_engine_target()
-        return engine_type(target_values)
+        return self._engine_type(target_values)
 
     @final
     @cache_readonly
@@ -967,6 +963,9 @@ def view(self, cls=None):
 
                 arr = self._data.view("i8")
                 idx_cls = self._dtype_to_subclass(dtype)
+                # NB: we only get here for subclasses that override
+                #  _data_cls such that it is a type and not a tuple
+                #  of types.
                 arr_cls = idx_cls._data_cls
                 arr = arr_cls(self._data.view("i8"), dtype=dtype)
                 return idx_cls._simple_new(arr, name=self.name)

From 09d8bf1082b713cc23af5d4d755c8174313b84eb Mon Sep 17 00:00:00 2001
From: Brock <jbrockmendel@gmail.com>
Date: Sat, 20 Nov 2021 20:05:02 -0800
Subject: [PATCH 33/57] lint fixup

---
 pandas/_libs/index.pyx | 1 -
 1 file changed, 1 deletion(-)

diff --git a/pandas/_libs/index.pyx b/pandas/_libs/index.pyx
index e6c0914e9e233..11be0afe4d86a 100644
--- a/pandas/_libs/index.pyx
+++ b/pandas/_libs/index.pyx
@@ -857,7 +857,6 @@ cdef class SharedEngine:
     cdef _check_type(self, object obj):
         raise NotImplementedError
 
-
     cpdef get_loc(self, object val):
         # -> Py_ssize_t | slice | ndarray[bool]
         cdef:

From 7d783b13ee3b8095944e9246d9a72baccae43852 Mon Sep 17 00:00:00 2001
From: Brock <jbrockmendel@gmail.com>
Date: Sat, 20 Nov 2021 20:38:07 -0800
Subject: [PATCH 34/57] revert no-longer-necessary

---
 pandas/_testing/asserters.py | 8 --------
 pandas/core/indexes/multi.py | 7 -------
 2 files changed, 15 deletions(-)

diff --git a/pandas/_testing/asserters.py b/pandas/_testing/asserters.py
index 354316af42b40..da8c0048decaa 100644
--- a/pandas/_testing/asserters.py
+++ b/pandas/_testing/asserters.py
@@ -43,7 +43,6 @@
     take_nd,
 )
 from pandas.core.arrays import (
-    BaseMaskedArray,
     DatetimeArray,
     ExtensionArray,
     IntervalArray,
@@ -405,13 +404,6 @@ def _get_ilevel_values(index, level):
         if not left.equals(right):
             mismatch = left._values != right._values
 
-            if isinstance(mismatch, BaseMaskedArray):
-                lvalues = cast(BaseMaskedArray, left._values)
-                rvalues = cast(BaseMaskedArray, right._values)
-                mismatch = mismatch.to_numpy(dtype=int, na_value=0)
-                mismask = lvalues._mask ^ rvalues._mask
-                mismatch[mismask] = 1
-
             diff = np.sum(mismatch.astype(int)) * 100.0 / len(left)
             msg = f"{obj} values are different ({np.round(diff, 5)} %)"
             raise_assert_detail(obj, msg, left, right)
diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py
index e22b19e345fd9..128aa8e282a0d 100644
--- a/pandas/core/indexes/multi.py
+++ b/pandas/core/indexes/multi.py
@@ -284,13 +284,6 @@ class MultiIndex(Index):
     of the mentioned helper methods.
     """
 
-    # error: Incompatible types in assignment (expression has type
-    # "Type[BaseMultiIndexCodesEngine]", base class "Index" defined the type as
-    # "Union[Type[IndexEngine], Type[NullableEngine], Type[ExtensionEngine]]")
-    _engine_type: type[
-        libindex.BaseMultiIndexCodesEngine
-    ] = libindex.BaseMultiIndexCodesEngine  # type: ignore[assignment]
-
     _hidden_attrs = Index._hidden_attrs | frozenset()
 
     # initialize to zero-length tuples to make everything work

From de642497e6f1912f65a8f970d9e6d99274453349 Mon Sep 17 00:00:00 2001
From: Brock <jbrockmendel@gmail.com>
Date: Mon, 22 Nov 2021 16:38:19 -0800
Subject: [PATCH 35/57] remove unnecessary warning check from test_setops

---
 pandas/tests/indexes/test_setops.py | 8 +-------
 1 file changed, 1 insertion(+), 7 deletions(-)

diff --git a/pandas/tests/indexes/test_setops.py b/pandas/tests/indexes/test_setops.py
index aa3c55cccfcef..d84ef44f236fc 100644
--- a/pandas/tests/indexes/test_setops.py
+++ b/pandas/tests/indexes/test_setops.py
@@ -209,13 +209,7 @@ def test_union_base(self, index):
         second = index[:5]
         everything = index
 
-        warn = None
-        if is_dtype_equal(index.dtype, "boolean") and index.isna().any():
-            warn = RuntimeWarning
-
-        msg = "boolean value of NA is ambiguous"
-        with tm.assert_produces_warning(warn, match=msg):
-            union = first.union(second)
+        union = first.union(second)
         assert tm.equalContents(union, everything)
 
         if is_datetime64tz_dtype(index.dtype):

From 1bb2901bbf70ae870bcba934133328e41aceb035 Mon Sep 17 00:00:00 2001
From: Brock <jbrockmendel@gmail.com>
Date: Mon, 22 Nov 2021 16:40:03 -0800
Subject: [PATCH 36/57] suggested edits

---
 pandas/tests/base/test_misc.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/pandas/tests/base/test_misc.py b/pandas/tests/base/test_misc.py
index d4708cb4dcfcc..a5af02122097d 100644
--- a/pandas/tests/base/test_misc.py
+++ b/pandas/tests/base/test_misc.py
@@ -84,7 +84,7 @@ def test_memory_usage(index_or_series_obj):
     is_categorical = is_categorical_dtype(obj.dtype) or (
         isinstance(obj, Series) and is_categorical_dtype(obj.index.dtype)
     )
-    is_string = is_dtype_equal(obj, "string[python]") or (
+    is_object_string = is_dtype_equal(obj, "string[python]") or (
         is_ser and is_dtype_equal(obj.index.dtype, "string[python]")
     )
 
@@ -94,10 +94,9 @@ def test_memory_usage(index_or_series_obj):
         else:
             expected = 108 if IS64 else 64
         assert res_deep == res == expected
-    elif is_object or is_categorical or is_string:
+    elif is_object or is_categorical or is_object_string:
         # only deep will pick them up
         assert res_deep > res
-        assert res_deep > res
     else:
         assert res == res_deep
 

From 23bb32560ee866f0728f1c52713ba838af9e46dd Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche <jorisvandenbossche@gmail.com>
Date: Fri, 26 Nov 2021 12:00:51 +0100
Subject: [PATCH 37/57] actually run the new base extension tests for all EAs

---
 pandas/tests/extension/base/__init__.py       |  1 +
 pandas/tests/extension/base/ea_index.py       | 11 ----------
 pandas/tests/extension/base/index.py          | 20 +++++++++++++++++++
 .../tests/extension/decimal/test_decimal.py   |  4 ++++
 pandas/tests/extension/json/test_json.py      |  4 ++++
 pandas/tests/extension/test_boolean.py        |  4 ++++
 pandas/tests/extension/test_categorical.py    |  4 ++++
 pandas/tests/extension/test_datetime.py       |  4 ++++
 pandas/tests/extension/test_floating.py       |  4 ++++
 pandas/tests/extension/test_integer.py        |  4 ++++
 pandas/tests/extension/test_interval.py       |  4 ++++
 pandas/tests/extension/test_numpy.py          |  6 ++++++
 pandas/tests/extension/test_period.py         |  4 ++++
 pandas/tests/extension/test_sparse.py         | 13 ++++++++++++
 pandas/tests/extension/test_string.py         |  4 ++++
 15 files changed, 80 insertions(+), 11 deletions(-)
 delete mode 100644 pandas/tests/extension/base/ea_index.py
 create mode 100644 pandas/tests/extension/base/index.py

diff --git a/pandas/tests/extension/base/__init__.py b/pandas/tests/extension/base/__init__.py
index 910b43a2cd148..876f595dcb03a 100644
--- a/pandas/tests/extension/base/__init__.py
+++ b/pandas/tests/extension/base/__init__.py
@@ -47,6 +47,7 @@ class TestMyDtype(BaseDtypeTests):
 from pandas.tests.extension.base.dtype import BaseDtypeTests  # noqa
 from pandas.tests.extension.base.getitem import BaseGetitemTests  # noqa
 from pandas.tests.extension.base.groupby import BaseGroupbyTests  # noqa
+from pandas.tests.extension.base.index import BaseIndexTests  # noqa
 from pandas.tests.extension.base.interface import BaseInterfaceTests  # noqa
 from pandas.tests.extension.base.io import BaseParsingTests  # noqa
 from pandas.tests.extension.base.methods import BaseMethodsTests  # noqa
diff --git a/pandas/tests/extension/base/ea_index.py b/pandas/tests/extension/base/ea_index.py
deleted file mode 100644
index 8309842f7134f..0000000000000
--- a/pandas/tests/extension/base/ea_index.py
+++ /dev/null
@@ -1,11 +0,0 @@
-"""
-Tests for Indexes backed by arbitrary ExtensionArrays.
-"""
-import pandas as pd
-from pandas.tests.extension.base.base import BaseExtensionTests
-
-
-class BaseExtensionIndexTests(BaseExtensionTests):
-    def test_index_from_array(self, data):
-        idx = pd.Index(data)
-        assert data.dtype == idx.dtype
diff --git a/pandas/tests/extension/base/index.py b/pandas/tests/extension/base/index.py
new file mode 100644
index 0000000000000..2539c38733a6c
--- /dev/null
+++ b/pandas/tests/extension/base/index.py
@@ -0,0 +1,20 @@
+"""
+Tests for Indexes backed by arbitrary ExtensionArrays.
+"""
+import pandas as pd
+from pandas.tests.extension.base.base import BaseExtensionTests
+
+
+class BaseIndexTests(BaseExtensionTests):
+    """Tests for Index object backed by an ExtensionArray"""
+
+    def test_index_from_array(self, data):
+        idx = pd.Index(data)
+        assert data.dtype == idx.dtype
+
+    def test_index_from_listlike_with_dtype(self, data):
+        idx = pd.Index(data, dtype=data.dtype)
+        assert idx.dtype == data.dtype
+
+        idx = pd.Index(list(data), dtype=data.dtype)
+        assert idx.dtype == data.dtype
diff --git a/pandas/tests/extension/decimal/test_decimal.py b/pandas/tests/extension/decimal/test_decimal.py
index 53416b6a3e9db..a00860f4d02da 100644
--- a/pandas/tests/extension/decimal/test_decimal.py
+++ b/pandas/tests/extension/decimal/test_decimal.py
@@ -101,6 +101,10 @@ def test_take_na_value_other_decimal(self):
         self.assert_extension_array_equal(result, expected)
 
 
+class TestIndex(base.BaseIndexTests):
+    pass
+
+
 class TestMissing(base.BaseMissingTests):
     pass
 
diff --git a/pandas/tests/extension/json/test_json.py b/pandas/tests/extension/json/test_json.py
index d5ecd8909319e..d530a75b74c8f 100644
--- a/pandas/tests/extension/json/test_json.py
+++ b/pandas/tests/extension/json/test_json.py
@@ -196,6 +196,10 @@ class TestGetitem(BaseJSON, base.BaseGetitemTests):
     pass
 
 
+class TestIndex(BaseJSON, base.BaseIndexTests):
+    pass
+
+
 class TestMissing(BaseJSON, base.BaseMissingTests):
     @pytest.mark.skip(reason="Setting a dict as a scalar")
     def test_fillna_series(self):
diff --git a/pandas/tests/extension/test_boolean.py b/pandas/tests/extension/test_boolean.py
index 05455905860d2..a4921b2f0bdf6 100644
--- a/pandas/tests/extension/test_boolean.py
+++ b/pandas/tests/extension/test_boolean.py
@@ -95,6 +95,10 @@ class TestSetitem(base.BaseSetitemTests):
     pass
 
 
+class TestIndex(base.BaseIndexTests):
+    pass
+
+
 class TestMissing(base.BaseMissingTests):
     pass
 
diff --git a/pandas/tests/extension/test_categorical.py b/pandas/tests/extension/test_categorical.py
index 6a1a9512bc036..f92c7dde9eb92 100644
--- a/pandas/tests/extension/test_categorical.py
+++ b/pandas/tests/extension/test_categorical.py
@@ -144,6 +144,10 @@ class TestSetitem(base.BaseSetitemTests):
     pass
 
 
+class TestIndex(base.BaseIndexTests):
+    pass
+
+
 class TestMissing(base.BaseMissingTests):
     @pytest.mark.skip(reason="Not implemented")
     def test_fillna_limit_pad(self, data_missing):
diff --git a/pandas/tests/extension/test_datetime.py b/pandas/tests/extension/test_datetime.py
index de5a6b7a5bb06..0c4759050c7c4 100644
--- a/pandas/tests/extension/test_datetime.py
+++ b/pandas/tests/extension/test_datetime.py
@@ -107,6 +107,10 @@ class TestGetitem(BaseDatetimeTests, base.BaseGetitemTests):
     pass
 
 
+class TestIndex(base.BaseIndexTests):
+    pass
+
+
 class TestMethods(BaseDatetimeTests, base.BaseMethodsTests):
     @pytest.mark.skip(reason="Incorrect expected")
     def test_value_counts(self, all_data, dropna):
diff --git a/pandas/tests/extension/test_floating.py b/pandas/tests/extension/test_floating.py
index 2b08c5b7be450..a6f1592d55224 100644
--- a/pandas/tests/extension/test_floating.py
+++ b/pandas/tests/extension/test_floating.py
@@ -167,6 +167,10 @@ class TestSetitem(base.BaseSetitemTests):
     pass
 
 
+class TestIndex(base.BaseIndexTests):
+    pass
+
+
 class TestMissing(base.BaseMissingTests):
     pass
 
diff --git a/pandas/tests/extension/test_integer.py b/pandas/tests/extension/test_integer.py
index 7d343aab3c7a0..4e64d8332ac63 100644
--- a/pandas/tests/extension/test_integer.py
+++ b/pandas/tests/extension/test_integer.py
@@ -190,6 +190,10 @@ class TestSetitem(base.BaseSetitemTests):
     pass
 
 
+class TestIndex(base.BaseIndexTests):
+    pass
+
+
 class TestMissing(base.BaseMissingTests):
     pass
 
diff --git a/pandas/tests/extension/test_interval.py b/pandas/tests/extension/test_interval.py
index 24c0d619e2b1a..11014f245cdce 100644
--- a/pandas/tests/extension/test_interval.py
+++ b/pandas/tests/extension/test_interval.py
@@ -90,6 +90,10 @@ class TestGetitem(BaseInterval, base.BaseGetitemTests):
     pass
 
 
+class TestIndex(base.BaseIndexTests):
+    pass
+
+
 class TestGrouping(BaseInterval, base.BaseGroupbyTests):
     pass
 
diff --git a/pandas/tests/extension/test_numpy.py b/pandas/tests/extension/test_numpy.py
index e60f7769270bd..3c359eac6deaf 100644
--- a/pandas/tests/extension/test_numpy.py
+++ b/pandas/tests/extension/test_numpy.py
@@ -219,6 +219,12 @@ def test_getitem_scalar(self, data):
         super().test_getitem_scalar(data)
 
 
+# TODO Index.__new__ checks for PandasArray (and converts it explicitly), so the
+# monkeypatching to make PandasArray into a regular ExtensionArray doesn't work
+# class TestIndex(base.BaseIndexTests):
+#     pass
+
+
 class TestGroupby(BaseNumPyTests, base.BaseGroupbyTests):
     def test_groupby_extension_apply(
         self, data_for_grouping, groupby_apply_op, request
diff --git a/pandas/tests/extension/test_period.py b/pandas/tests/extension/test_period.py
index f210a4ce56091..873fe9d6463ad 100644
--- a/pandas/tests/extension/test_period.py
+++ b/pandas/tests/extension/test_period.py
@@ -85,6 +85,10 @@ class TestGetitem(BasePeriodTests, base.BaseGetitemTests):
     pass
 
 
+class TestIndex(base.BaseIndexTests):
+    pass
+
+
 class TestMethods(BasePeriodTests, base.BaseMethodsTests):
     def test_combine_add(self, data_repeated):
         # Period + Period is not defined.
diff --git a/pandas/tests/extension/test_sparse.py b/pandas/tests/extension/test_sparse.py
index 012a3fbb12cac..fb3f95f88da71 100644
--- a/pandas/tests/extension/test_sparse.py
+++ b/pandas/tests/extension/test_sparse.py
@@ -194,6 +194,19 @@ def test_reindex(self, data, na_value):
 # Skipping TestSetitem, since we don't implement it.
 
 
+class TestIndex(base.BaseIndexTests):
+    def test_index_from_array(self, data):
+        idx = pd.Index(data)
+        # TODO do we want to preserve the sparse dtype in the index
+        # now this is possible?
+        assert idx.dtype == data.dtype.subtype
+
+    # TODO this is failing because it doesn't recognize the sparse dtype
+    @pytest.mark.xfail(reason="Index cannot yet store sparse dtype")
+    def test_index_from_listlike_with_dtype(self, data):
+        super().test_index_from_listlike_with_dtype(data)
+
+
 class TestMissing(BaseSparseTests, base.BaseMissingTests):
     def test_isna(self, data_missing):
         expected_dtype = SparseDtype(bool, pd.isna(data_missing.dtype.fill_value))
diff --git a/pandas/tests/extension/test_string.py b/pandas/tests/extension/test_string.py
index af6c149447d15..827a410871329 100644
--- a/pandas/tests/extension/test_string.py
+++ b/pandas/tests/extension/test_string.py
@@ -136,6 +136,10 @@ def test_setitem_preserves_views(self, data, request):
         super().test_setitem_preserves_views(data)
 
 
+class TestIndex(base.BaseIndexTests):
+    pass
+
+
 class TestMissing(base.BaseMissingTests):
     pass
 

From 4abd60ea5a75a76b798e7fc186cf331e7139e6d4 Mon Sep 17 00:00:00 2001
From: Brock <jbrockmendel@gmail.com>
Date: Mon, 29 Nov 2021 20:15:50 -0800
Subject: [PATCH 38/57] update tests

---
 pandas/tests/base/test_conversion.py       | 2 +-
 pandas/tests/indexes/test_numpy_compat.py  | 6 +++++-
 pandas/tests/reductions/test_reductions.py | 2 +-
 pandas/tests/strings/test_strings.py       | 3 ---
 4 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/pandas/tests/base/test_conversion.py b/pandas/tests/base/test_conversion.py
index c483d4354a7b9..a4319f1f2b4e2 100644
--- a/pandas/tests/base/test_conversion.py
+++ b/pandas/tests/base/test_conversion.py
@@ -267,7 +267,7 @@ def test_numpy_array_all_dtypes(any_numpy_dtype):
 )
 def test_array(arr, attr, index_or_series, request):
     box = index_or_series
-    if arr.dtype.name in ("Int64", "Sparse[int64, 0]") and box is pd.Index:
+    if arr.dtype.name in ("Sparse[int64, 0]") and box is pd.Index:
         mark = pytest.mark.xfail(reason="Needs EA-Backed Index")
         request.node.add_marker(mark)
 
diff --git a/pandas/tests/indexes/test_numpy_compat.py b/pandas/tests/indexes/test_numpy_compat.py
index a619b6756f00a..8bca938218c0a 100644
--- a/pandas/tests/indexes/test_numpy_compat.py
+++ b/pandas/tests/indexes/test_numpy_compat.py
@@ -117,11 +117,15 @@ def test_numpy_ufuncs_other(index, func, request):
 
 
 @pytest.mark.parametrize("func", [np.maximum, np.minimum])
-def test_numpy_ufuncs_reductions(index, func):
+def test_numpy_ufuncs_reductions(index, func, request):
     # TODO: overlap with tests.series.test_ufunc.test_reductions
     if len(index) == 0:
         return
 
+    if repr(index.dtype) == "string[pyarrow]" or index.dtype == "boolean":
+        mark = pytest.mark.xfail(reason="ArrowStringArray/BooleanArray has no min/max")
+        request.node.add_marker(mark)
+
     if isinstance(index, CategoricalIndex) and index.dtype.ordered is False:
         with pytest.raises(TypeError, match="is not ordered for"):
             func.reduce(index)
diff --git a/pandas/tests/reductions/test_reductions.py b/pandas/tests/reductions/test_reductions.py
index cc7121d2f4656..2cc204102cb74 100644
--- a/pandas/tests/reductions/test_reductions.py
+++ b/pandas/tests/reductions/test_reductions.py
@@ -83,7 +83,7 @@ def test_nanminmax(self, opname, dtype, val, index_or_series, request):
         # GH#7261
         klass = index_or_series
 
-        if dtype in ["Int64", "boolean"] and klass == Index:
+        if dtype in ["boolean"] and klass == Index:
             mark = pytest.mark.xfail(reason="Need EA-backed Index")
             request.node.add_marker(mark)
 
diff --git a/pandas/tests/strings/test_strings.py b/pandas/tests/strings/test_strings.py
index 3e49d6367ffd9..461443385a74b 100644
--- a/pandas/tests/strings/test_strings.py
+++ b/pandas/tests/strings/test_strings.py
@@ -367,9 +367,6 @@ def test_len_mixed():
 def test_index(
     method, sub, start, end, index_or_series, any_string_dtype, expected, request
 ):
-    if index_or_series is Index and not any_string_dtype == "object":
-        mark = pytest.mark.xfail(reason="Need EA-backed Index")
-        request.node.add_marker(mark)
 
     obj = index_or_series(
         ["ABCDEFG", "BCDEFEF", "DEFGHIJEF", "EFGHEF"], dtype=any_string_dtype

From 7f9741efdace376c38be7eb9837c2fbdc439507d Mon Sep 17 00:00:00 2001
From: Brock <jbrockmendel@gmail.com>
Date: Tue, 30 Nov 2021 08:23:56 -0800
Subject: [PATCH 39/57] older np compat

---
 pandas/tests/indexes/test_numpy_compat.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pandas/tests/indexes/test_numpy_compat.py b/pandas/tests/indexes/test_numpy_compat.py
index 8bca938218c0a..58711ae0d8afc 100644
--- a/pandas/tests/indexes/test_numpy_compat.py
+++ b/pandas/tests/indexes/test_numpy_compat.py
@@ -122,7 +122,7 @@ def test_numpy_ufuncs_reductions(index, func, request):
     if len(index) == 0:
         return
 
-    if repr(index.dtype) == "string[pyarrow]" or index.dtype == "boolean":
+    if repr(index.dtype) == "string[pyarrow]" or str(index.dtype) == "boolean":
         mark = pytest.mark.xfail(reason="ArrowStringArray/BooleanArray has no min/max")
         request.node.add_marker(mark)
 

From 40e861b8d92aab7a33e8a014c757e6a8c70f1710 Mon Sep 17 00:00:00 2001
From: Brock <jbrockmendel@gmail.com>
Date: Tue, 30 Nov 2021 16:42:51 -0800
Subject: [PATCH 40/57] 32bit compat

---
 pandas/tests/extension/test_sparse.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/pandas/tests/extension/test_sparse.py b/pandas/tests/extension/test_sparse.py
index ea085f638342a..564f2cb2915a7 100644
--- a/pandas/tests/extension/test_sparse.py
+++ b/pandas/tests/extension/test_sparse.py
@@ -199,9 +199,14 @@ def test_index_from_array(self, data):
         idx = pd.Index(data)
         # TODO do we want to preserve the sparse dtype in the index
         # now this is possible?
+        if data.dtype.subtype == "f":
+            assert idx.dtype == np.float64
+        elif data.dtype.subtype == "i":
+            assert idx.dtype == np.int64
         assert idx.dtype == data.dtype.subtype
 
-    # TODO this is failing because it doesn't recognize the sparse dtype
+    # TODO(ExtensionIndex) this is failing because it doesn't recognize
+    #  the sparse dtype
     @pytest.mark.xfail(reason="Index cannot yet store sparse dtype")
     def test_index_from_listlike_with_dtype(self, data):
         super().test_index_from_listlike_with_dtype(data)

From 6e79350532732397d5ee3b1752dd6a859b29e7d7 Mon Sep 17 00:00:00 2001
From: Brock <jbrockmendel@gmail.com>
Date: Tue, 30 Nov 2021 18:44:34 -0800
Subject: [PATCH 41/57] simplify, docstring

---
 pandas/_libs/lib.pyx | 23 ++++++++++++-----------
 1 file changed, 12 insertions(+), 11 deletions(-)

diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx
index b6372543d2fc7..4d2b910071130 100644
--- a/pandas/_libs/lib.pyx
+++ b/pandas/_libs/lib.pyx
@@ -3041,22 +3041,23 @@ def is_bool_list(obj: list) -> bool:
 
 
 cpdef ndarray eq_NA_compat(ndarray[object] arr, object key):
+    """
+    Check for `arr == key`, treating all values as not-equal to pd.NA.
+
+    key is assumed to have `not isna(key)`
+    """
     cdef:
         ndarray[uint8_t, cast=True] result = np.empty(len(arr), dtype=bool)
         Py_ssize_t i
         object item
 
-    if key is C_NA:
-        for i in range(len(arr)):
-            item = arr[i]
-            result[i] = item is C_NA
-    else:
-        for i in range(len(arr)):
-            item = arr[i]
-            if item is C_NA:
-                result[i] = False
-            else:
-                result[i] = item == key  # FIXME: compat for other NAs
+    for i in range(len(arr)):
+        item = arr[i]
+        if item is C_NA:
+            result[i] = False
+        else:
+            result[i] = item == key
+
     return result
 
 

From 90366e9d3851d007c76b733a6f0da59080d484e0 Mon Sep 17 00:00:00 2001
From: Brock <jbrockmendel@gmail.com>
Date: Tue, 30 Nov 2021 20:40:47 -0800
Subject: [PATCH 42/57] 32bit compat

---
 pandas/tests/extension/test_sparse.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/pandas/tests/extension/test_sparse.py b/pandas/tests/extension/test_sparse.py
index 564f2cb2915a7..59e098ee59c40 100644
--- a/pandas/tests/extension/test_sparse.py
+++ b/pandas/tests/extension/test_sparse.py
@@ -203,7 +203,8 @@ def test_index_from_array(self, data):
             assert idx.dtype == np.float64
         elif data.dtype.subtype == "i":
             assert idx.dtype == np.int64
-        assert idx.dtype == data.dtype.subtype
+        else:
+            assert idx.dtype == data.dtype.subtype
 
     # TODO(ExtensionIndex) this is failing because it doesn't recognize
     #  the sparse dtype

From 70debb248a61e80d1d9540330065e0ef2e85fab4 Mon Sep 17 00:00:00 2001
From: Brock <jbrockmendel@gmail.com>
Date: Wed, 1 Dec 2021 11:09:08 -0800
Subject: [PATCH 43/57] Address comments

---
 pandas/core/dtypes/common.py         | 3 ++-
 pandas/core/indexes/base.py          | 1 +
 pandas/tests/base/test_conversion.py | 2 +-
 pandas/tests/extension/test_numpy.py | 8 ++------
 4 files changed, 6 insertions(+), 8 deletions(-)

diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py
index 7ac8e6c47158c..e1770dc713c9a 100644
--- a/pandas/core/dtypes/common.py
+++ b/pandas/core/dtypes/common.py
@@ -1319,7 +1319,8 @@ def is_bool_dtype(arr_or_dtype) -> bool:
         # we don't have a boolean Index class
         # so its object, we need to infer to
         # guess this
-        return arr_or_dtype.is_object() and arr_or_dtype.inferred_type == "boolean"
+        # Allow Index[object] that is all-bools or Index["boolean"]
+        return arr_or_dtype.inferred_type == "boolean"
     elif is_extension_array_dtype(arr_or_dtype):
         return getattr(dtype, "_is_boolean", False)
 
diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py
index cc6a6430d2fb5..0166a649892ad 100644
--- a/pandas/core/indexes/base.py
+++ b/pandas/core/indexes/base.py
@@ -3471,6 +3471,7 @@ def symmetric_difference(self, other, result_name=None, sort=None):
         res_values = concat_compat([left_diff, right_diff])
         res_values = _maybe_try_sort(res_values, sort)
 
+        # pass dtype so we retain object dtype
         result = Index(res_values, name=result_name, dtype=res_values.dtype)
 
         if self._is_multi:
diff --git a/pandas/tests/base/test_conversion.py b/pandas/tests/base/test_conversion.py
index 97037b95e05b6..49afe2859098d 100644
--- a/pandas/tests/base/test_conversion.py
+++ b/pandas/tests/base/test_conversion.py
@@ -257,7 +257,7 @@ def test_numpy_array_all_dtypes(any_numpy_dtype):
 def test_array(arr, attr, index_or_series, request):
     box = index_or_series
     if arr.dtype.name in ("Sparse[int64, 0]") and box is pd.Index:
-        mark = pytest.mark.xfail(reason="Needs EA-Backed Index")
+        mark = pytest.mark.xfail(reason="Index cannot yet store sparse dtype")
         request.node.add_marker(mark)
 
     result = box(arr, copy=False).array
diff --git a/pandas/tests/extension/test_numpy.py b/pandas/tests/extension/test_numpy.py
index 3c359eac6deaf..34003fd092c18 100644
--- a/pandas/tests/extension/test_numpy.py
+++ b/pandas/tests/extension/test_numpy.py
@@ -12,6 +12,8 @@
 classes (if they are relevant for the extension interface for all dtypes), or
 be added to the array-specific tests in `pandas/tests/arrays/`.
 
+Note: we do not bother with base.BaseIndexTests because PandasArray
+will never be held in an Index.
 """
 import numpy as np
 import pytest
@@ -219,12 +221,6 @@ def test_getitem_scalar(self, data):
         super().test_getitem_scalar(data)
 
 
-# TODO Index.__new__ checks for PandasArray (and converts it explicitly), so the
-# monkeypatching to make PandasArray into a regular ExtensionArray doesn't work
-# class TestIndex(base.BaseIndexTests):
-#     pass
-
-
 class TestGroupby(BaseNumPyTests, base.BaseGroupbyTests):
     def test_groupby_extension_apply(
         self, data_for_grouping, groupby_apply_op, request

From 0339e69f27b0e09b6bc142a41f546196c5be39b7 Mon Sep 17 00:00:00 2001
From: Brock <jbrockmendel@gmail.com>
Date: Sat, 4 Dec 2021 18:34:02 -0800
Subject: [PATCH 44/57] simplify

---
 pandas/tests/indexes/test_base.py | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py
index b383381d9a5c5..7d62b2f6f2247 100644
--- a/pandas/tests/indexes/test_base.py
+++ b/pandas/tests/indexes/test_base.py
@@ -530,21 +530,20 @@ def test_map_dictlike(self, index, mapper):
             # Cannot map duplicated index
             return
 
+        rng = np.arange(len(index), 0, -1)
+
         if index.empty:
             # to match proper result coercion for uints
             expected = Index([])
         elif index._is_backward_compat_public_numeric_index:
-            expected = index._constructor(
-                np.arange(len(index), 0, -1), dtype=index.dtype
-            )
+            expected = index._constructor(rng, dtype=index.dtype)
         elif type(index) is Index and index.dtype != object:
             # i.e. EA-backed, for now just Nullable
-            expected = Index(np.arange(len(index), 0, -1), dtype=index.dtype)
+            expected = Index(rng, dtype=index.dtype)
         elif index.dtype.kind == "u":
-            # TODO: case where e.g. we cannot hold result in UInt8?
-            expected = Index(np.arange(len(index), 0, -1), dtype=index.dtype)
+            expected = Index(rng, dtype=index.dtype)
         else:
-            expected = Index(np.arange(len(index), 0, -1))
+            expected = Index(rng)
 
         result = index.map(mapper(expected, index))
         tm.assert_index_equal(result, expected)

From 96b25aab8cfab27891adc3af735b323a75ec5af7 Mon Sep 17 00:00:00 2001
From: Brock <jbrockmendel@gmail.com>
Date: Sun, 5 Dec 2021 14:31:06 -0800
Subject: [PATCH 45/57] don't catch np.float16 too early

---
 pandas/core/arrays/floating.py | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

diff --git a/pandas/core/arrays/floating.py b/pandas/core/arrays/floating.py
index f127b1db4c553..ef812b0823ca9 100644
--- a/pandas/core/arrays/floating.py
+++ b/pandas/core/arrays/floating.py
@@ -251,12 +251,7 @@ def dtype(self) -> FloatingDtype:
         return FLOAT_STR_TO_DTYPE[str(self._data.dtype)]
 
     def __init__(self, values: np.ndarray, mask: np.ndarray, copy: bool = False):
-        if not (
-            isinstance(values, np.ndarray)
-            and values.dtype.kind == "f"
-            and values.dtype.itemsize > 2
-        ):
-            # We do not support float16
+        if not (isinstance(values, np.ndarray) and values.dtype.kind == "f"):
             raise TypeError(
                 "values should be floating numpy array. Use "
                 "the 'pd.array' function instead"

From 812da93a0807e522a35091c4be1248b7eb8a3353 Mon Sep 17 00:00:00 2001
From: Brock <jbrockmendel@gmail.com>
Date: Tue, 7 Dec 2021 11:39:26 -0800
Subject: [PATCH 46/57] de-xfail

---
 pandas/tests/indexes/test_numpy_compat.py  | 4 ++--
 pandas/tests/reductions/test_reductions.py | 6 +-----
 2 files changed, 3 insertions(+), 7 deletions(-)

diff --git a/pandas/tests/indexes/test_numpy_compat.py b/pandas/tests/indexes/test_numpy_compat.py
index 58711ae0d8afc..03bdc13d1ea8b 100644
--- a/pandas/tests/indexes/test_numpy_compat.py
+++ b/pandas/tests/indexes/test_numpy_compat.py
@@ -122,8 +122,8 @@ def test_numpy_ufuncs_reductions(index, func, request):
     if len(index) == 0:
         return
 
-    if repr(index.dtype) == "string[pyarrow]" or str(index.dtype) == "boolean":
-        mark = pytest.mark.xfail(reason="ArrowStringArray/BooleanArray has no min/max")
+    if repr(index.dtype) == "string[pyarrow]":
+        mark = pytest.mark.xfail(reason="ArrowStringArray has no min/max")
         request.node.add_marker(mark)
 
     if isinstance(index, CategoricalIndex) and index.dtype.ordered is False:
diff --git a/pandas/tests/reductions/test_reductions.py b/pandas/tests/reductions/test_reductions.py
index 004c087044017..70e739d1440d6 100644
--- a/pandas/tests/reductions/test_reductions.py
+++ b/pandas/tests/reductions/test_reductions.py
@@ -79,14 +79,10 @@ def test_ops(self, opname, obj):
             ("boolean", True),
         ],
     )
-    def test_nanminmax(self, opname, dtype, val, index_or_series, request):
+    def test_nanminmax(self, opname, dtype, val, index_or_series):
         # GH#7261
         klass = index_or_series
 
-        if dtype in ["boolean"] and klass == Index:
-            mark = pytest.mark.xfail(reason="Need EA-backed Index")
-            request.node.add_marker(mark)
-
         def check_missing(res):
             if dtype == "datetime64[ns]":
                 return res is NaT

From e3943942f6e5aec539df01341d42188860cb7044 Mon Sep 17 00:00:00 2001
From: Brock <jbrockmendel@gmail.com>
Date: Tue, 7 Dec 2021 14:54:52 -0800
Subject: [PATCH 47/57] remove edits made extraneous by other PRs

---
 pandas/core/arrays/floating.py | 2 --
 pandas/core/arrays/masked.py   | 4 ----
 2 files changed, 6 deletions(-)

diff --git a/pandas/core/arrays/floating.py b/pandas/core/arrays/floating.py
index 8611f8c8af0e3..4c868747fa930 100644
--- a/pandas/core/arrays/floating.py
+++ b/pandas/core/arrays/floating.py
@@ -103,8 +103,6 @@ def coerce_to_array(
     if dtype is None and hasattr(values, "dtype"):
         if is_float_dtype(values.dtype):
             dtype = values.dtype
-            if dtype == "float16":
-                raise TypeError("FloatingArray does not support float16 dtype")
 
     if dtype is not None:
         if isinstance(dtype, str) and dtype.startswith("Float"):
diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py
index 8e02e21f59f42..1c9c2a4780722 100644
--- a/pandas/core/arrays/masked.py
+++ b/pandas/core/arrays/masked.py
@@ -468,10 +468,6 @@ def reconstruct(x):
                 m = mask.copy()
                 return IntegerArray(x, m)
             elif is_float_dtype(x.dtype):
-                if x.dtype.itemsize <= 2:
-                    # reached in e.g. np.sqrt on BooleanArray
-                    # we don't support float16
-                    x = x.astype(np.float32)
                 m = mask.copy()
                 if x.dtype == np.float16:
                     # reached in e.g. np.sqrt on BooleanArray

From 3e1ec002e99af65aff18e41c88026cb506d6b768 Mon Sep 17 00:00:00 2001
From: Brock <jbrockmendel@gmail.com>
Date: Mon, 13 Dec 2021 08:48:23 -0800
Subject: [PATCH 48/57] suggested edits

---
 pandas/core/dtypes/common.py           | 5 -----
 pandas/tests/base/test_value_counts.py | 7 ++-----
 pandas/tests/indexes/common.py         | 7 +------
 3 files changed, 3 insertions(+), 16 deletions(-)

diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py
index 01737591c2faa..72df30a4c96aa 100644
--- a/pandas/core/dtypes/common.py
+++ b/pandas/core/dtypes/common.py
@@ -1323,11 +1323,6 @@ def is_bool_dtype(arr_or_dtype) -> bool:
         # now we use the special definition for Index
 
     if isinstance(arr_or_dtype, ABCIndex):
-
-        # TODO(jreback)
-        # we don't have a boolean Index class
-        # so its object, we need to infer to
-        # guess this
         # Allow Index[object] that is all-bools or Index["boolean"]
         return arr_or_dtype.inferred_type == "boolean"
     elif is_extension_array_dtype(arr_or_dtype):
diff --git a/pandas/tests/base/test_value_counts.py b/pandas/tests/base/test_value_counts.py
index f9ba713d20dfe..6a479ee347f16 100644
--- a/pandas/tests/base/test_value_counts.py
+++ b/pandas/tests/base/test_value_counts.py
@@ -32,16 +32,14 @@ def test_value_counts(index_or_series_obj):
         expected.index = Index(expected.index)
 
     if not isinstance(result.dtype, np.dtype):
-        # TODO: be more specific
         # i.e IntegerDtype
-        expected = expected.astype(result.dtype)
+        expected = expected.astype("Int64")
 
     # TODO(GH#32514): Order of entries with the same count is inconsistent
     #  on CI (gh-32449)
     if obj.duplicated().any():
         result = result.sort_index()
         expected = expected.sort_index()
-
     tm.assert_series_equal(result, expected)
 
 
@@ -78,9 +76,8 @@ def test_value_counts_null(null_obj, index_or_series_obj):
         result = result.sort_index()
 
     if not isinstance(result.dtype, np.dtype):
-        # TODO: be more specific
         # i.e IntegerDtype
-        expected = expected.astype(result.dtype)
+        expected = expected.astype("Int64")
     tm.assert_series_equal(result, expected)
 
     expected[null_obj] = 3
diff --git a/pandas/tests/indexes/common.py b/pandas/tests/indexes/common.py
index 7a031e4319654..699ef88bf419f 100644
--- a/pandas/tests/indexes/common.py
+++ b/pandas/tests/indexes/common.py
@@ -252,12 +252,7 @@ def test_ensure_copied_data(self, index):
                     index._values._ndarray, result._values._ndarray, check_same="same"
                 )
             elif index.dtype == "string[pyarrow]":
-                # https://github.com/pandas-dev/pandas/pull/43930#discussion_r736862669
-                result_pa_data = result._values._data
-                index_pa_data = index._values._data
-                res_buf1 = result_pa_data.chunk(0).buffers()[1]
-                idx_buf1 = index_pa_data.chunk(0).buffers()[1]
-                assert res_buf1.address == idx_buf1.address
+                assert tm.shares_memory(result._values, index._values)
             else:
                 raise NotImplementedError(index.dtype)
         else:

From 267b1b33634f81e882372c6476a159acf55c3586 Mon Sep 17 00:00:00 2001
From: Brock <jbrockmendel@gmail.com>
Date: Wed, 15 Dec 2021 15:03:05 -0800
Subject: [PATCH 49/57] Remove NullableEngine, ExtensionEngine

---
 pandas/_libs/index.pyi      |  36 ----
 pandas/_libs/index.pyx      | 350 ------------------------------------
 pandas/core/indexes/base.py |  20 ++-
 3 files changed, 14 insertions(+), 392 deletions(-)

diff --git a/pandas/_libs/index.pyi b/pandas/_libs/index.pyi
index 21f1c3e147a59..68163c1284b30 100644
--- a/pandas/_libs/index.pyi
+++ b/pandas/_libs/index.pyi
@@ -71,39 +71,3 @@ class BaseMultiIndexCodesEngine:
         method: str,
         limit: int | None,
     ) -> npt.NDArray[np.intp]: ...
-
-class NullableEngine:
-    def __init__(self, values: "BaseMaskedArray"): ...
-    def __contains__(self, val: object) -> bool: ...
-    def get_loc(self, val: object) -> int | slice | np.ndarray: ...
-    def get_indexer(self, values: np.ndarray) -> npt.NDArray[np.intp]: ...
-    def get_indexer_non_unique(
-        self,
-        targets: np.ndarray,
-    ) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]: ...
-    @property
-    def is_unique(self) -> bool: ...
-    @property
-    def is_monotonic_increasing(self) -> bool: ...
-    @property
-    def is_monotonic_decreasing(self) -> bool: ...
-    def sizeof(self, deep: bool = ...) -> int: ...
-    def clear_mapping(self): ...
-
-class ExtensionEngine:
-    def __init__(self, values: "ExtensionArray"): ...
-    def __contains__(self, val: object) -> bool: ...
-    def get_loc(self, val: object) -> int | slice | np.ndarray: ...
-    def get_indexer(self, values: np.ndarray) -> npt.NDArray[np.intp]: ...
-    def get_indexer_non_unique(
-        self,
-        targets: np.ndarray,
-    ) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]: ...
-    @property
-    def is_unique(self) -> bool: ...
-    @property
-    def is_monotonic_increasing(self) -> bool: ...
-    @property
-    def is_monotonic_decreasing(self) -> bool: ...
-    def sizeof(self, deep: bool = ...) -> int: ...
-    def clear_mapping(self): ...
diff --git a/pandas/_libs/index.pyx b/pandas/_libs/index.pyx
index 37ac02a2bd507..c3b86165e6d2c 100644
--- a/pandas/_libs/index.pyx
+++ b/pandas/_libs/index.pyx
@@ -797,353 +797,3 @@ cdef class BaseMultiIndexCodesEngine:
 
 # Generated from template.
 include "index_class_helper.pxi"
-
-
-@cython.internal
-@cython.freelist(32)
-cdef class SharedEngine:
-    cdef readonly:
-        object values  # ExtensionArray
-        bint over_size_threshold
-
-    cdef:
-        bint unique, monotonic_inc, monotonic_dec
-        bint need_monotonic_check, need_unique_check
-
-    def __contains__(self, val: object) -> bool:
-        # We assume before we get here:
-        #  - val is hashable
-        try:
-            self.get_loc(val)
-            return True
-        except KeyError:
-            return False
-
-    def clear_mapping(self):
-        # for compat with IndexEngine
-        pass
-
-    @property
-    def is_unique(self) -> bool:
-        if self.need_unique_check:
-            arr = self.values.unique()
-            self.unique = len(arr) == len(self.values)
-
-            self.need_unique_check = False
-        return self.unique
-
-    cdef _do_monotonic_check(self):
-        raise NotImplementedError
-
-    @property
-    def is_monotonic_increasing(self) -> bool:
-        if self.need_monotonic_check:
-            self._do_monotonic_check()
-
-        return self.monotonic_inc == 1
-
-    @property
-    def is_monotonic_decreasing(self) -> bool:
-        if self.need_monotonic_check:
-            self._do_monotonic_check()
-
-        return self.monotonic_dec == 1
-
-    cdef _call_monotonic(self, values):
-        return algos.is_monotonic(values, timelike=False)
-
-    def sizeof(self, deep: bool = False) -> int:
-        """ return the sizeof our mapping """
-        return 0
-
-    def __sizeof__(self) -> int:
-        return self.sizeof()
-
-    cdef _check_type(self, object obj):
-        raise NotImplementedError
-
-    cpdef get_loc(self, object val):
-        # -> Py_ssize_t | slice | ndarray[bool]
-        cdef:
-            Py_ssize_t loc
-
-        if is_definitely_invalid_key(val):
-            raise TypeError(f"'{val}' is an invalid key")
-
-        self._check_type(val)
-
-        if self.over_size_threshold and self.is_monotonic_increasing:
-            if not self.is_unique:
-                return self._get_loc_duplicates(val)
-
-            values = self.values
-
-            loc = self._searchsorted_left(val)
-            if loc >= len(values):
-                raise KeyError(val)
-            if values[loc] != val:
-                raise KeyError(val)
-            return loc
-
-        if not self.unique:
-            return self._get_loc_duplicates(val)
-
-        return self._get_loc_duplicates(val)
-
-    cdef inline _get_loc_duplicates(self, object val):
-        # -> Py_ssize_t | slice | ndarray[bool]
-        cdef:
-            Py_ssize_t diff
-
-        if self.is_monotonic_increasing:
-            values = self.values
-            try:
-                left = values.searchsorted(val, side='left')
-                right = values.searchsorted(val, side='right')
-            except TypeError:
-                # e.g. GH#29189 get_loc(None) with a Float64Index
-                raise KeyError(val)
-
-            diff = right - left
-            if diff == 0:
-                raise KeyError(val)
-            elif diff == 1:
-                return left
-            else:
-                return slice(left, right)
-
-        return self._maybe_get_bool_indexer(val)
-
-    cdef Py_ssize_t _searchsorted_left(self, val) except? -1:
-        """
-        See ObjectEngine._searchsorted_left.__doc__.
-        """
-        try:
-            loc = self.values.searchsorted(val, side="left")
-        except TypeError as err:
-            # GH#35788 e.g. val=None with float64 values
-            raise KeyError(val)
-        return loc
-
-    cdef ndarray _get_bool_indexer(self, val):
-        raise NotImplementedError
-
-    cdef _maybe_get_bool_indexer(self, object val):
-        # Returns ndarray[bool] or int
-        cdef:
-            ndarray[uint8_t, ndim=1, cast=True] indexer
-
-        indexer = self._get_bool_indexer(val)
-        return _unpack_bool_indexer(indexer, val)
-
-    def get_indexer(self, values) -> np.ndarray:
-        # values : type(self.values)
-        # Note: we only get here with self.is_unique
-        cdef:
-            Py_ssize_t i, N = len(values)
-
-        res = np.empty(N, dtype=np.intp)
-
-        for i in range(N):
-            val = values[i]
-            try:
-                loc = self.get_loc(val)
-                # Because we are unique, loc should always be an integer
-            except KeyError:
-                loc = -1
-            else:
-                assert util.is_integer_object(loc), (loc, val)
-            res[i] = loc
-
-        return res
-
-    def get_indexer_non_unique(self, targets):
-        """
-        Return an indexer suitable for taking from a non unique index
-        return the labels in the same order as the target
-        and a missing indexer into the targets (which correspond
-        to the -1 indices in the results
-
-        Parameters
-        ----------
-        targets : type(self.values)
-
-        Returns
-        -------
-        indexer : np.ndarray[np.intp]
-        missing : np.ndarray[np.intp]
-        """
-        cdef:
-            Py_ssize_t i, N = len(targets)
-
-        indexer = []
-        missing = []
-
-        # See also IntervalIndex.get_indexer_pointwise
-        for i in range(N):
-            val = targets[i]
-
-            try:
-                locs = self.get_loc(val)
-            except KeyError:
-                locs = np.array([-1], dtype=np.intp)
-                missing.append(i)
-            else:
-                if isinstance(locs, slice):
-                    # Only needed for get_indexer_non_unique
-                    locs = np.arange(locs.start, locs.stop, locs.step, dtype=np.intp)
-                elif util.is_integer_object(locs):
-                    locs = np.array([locs], dtype=np.intp)
-                else:
-                    assert locs.dtype.kind == "b"
-                    locs = locs.nonzero()[0]
-
-            indexer.append(locs)
-
-        try:
-            indexer = np.concatenate(indexer, dtype=np.intp)
-        except TypeError:
-            # numpy<1.20 doesn't accept dtype keyword
-            indexer = np.concatenate(indexer).astype(np.intp, copy=False)
-        missing = np.array(missing, dtype=np.intp)
-
-        return indexer, missing
-
-
-cdef class ExtensionEngine(SharedEngine):
-    def __init__(self, values: "ExtensionArray"):
-        self.values = values
-
-        self.over_size_threshold = len(values) >= _SIZE_CUTOFF
-        self.need_unique_check = True
-        self.need_monotonic_check = True
-        self.need_unique_check = True
-
-    cdef _do_monotonic_check(self):
-        cdef:
-            bint is_unique
-
-        # FIXME: shouldn't depend on non-required _values_for_argsort
-        try:
-            self.monotonic_inc, self.monotonic_dec, is_unique = \
-                self._call_monotonic(self.values._values_for_argsort())
-        except TypeError:
-            self.monotonic_inc = 0
-            self.monotonic_dec = 0
-            is_unique = 0
-
-        self.need_monotonic_check = 0
-
-        # we can only be sure of uniqueness if is_unique=1
-        if is_unique:
-            self.unique = 1
-            self.need_unique_check = 0
-
-    cdef ndarray _get_bool_indexer(self, val):
-        if checknull(val):
-            return self.values.isna().view("uint8")
-
-        try:
-            return self.values == val
-        except TypeError:
-            # e.g. if __eq__ returns a BooleanArray instead of ndarry[bool]
-            try:
-                return (self.values == val).to_numpy(dtype=bool, na_value=False)
-            except (TypeError, AttributeError) as err:
-                # e.g. (self.values == val) returned a bool
-                #  see test_get_loc_generator[string[pyarrow]]
-                # e.g. self.value == val raises TypeError bc generator has no len
-                #  see test_get_loc_generator[string[python]]
-                raise KeyError from err
-
-    cdef _check_type(self, object val):
-        hash(val)
-
-
-cdef class NullableEngine(SharedEngine):
-
-    cdef readonly:
-        ndarray _values, _mask
-        bint has_missing
-
-    def __init__(self, values: "MaskedArray"):
-        self.values = values
-
-        self._values = values._data
-        self._mask = values._mask
-
-        self.has_missing = values._mask.any()
-
-        self.over_size_threshold = len(values) >= _SIZE_CUTOFF
-        self.need_unique_check = True
-
-    cdef _do_monotonic_check(self):
-        cdef:
-            bint is_unique
-
-        if self.has_missing:
-            self.monotonic_inc = 0
-            self.monotonic_dec = 0
-            self.need_monotonic_check = 0
-            return
-
-        # If there are no missing, then we can just look at self._values
-        try:
-            self.monotonic_inc, self.monotonic_dec, is_unique = \
-                self._call_monotonic(self._values)
-        except TypeError:
-            self.monotonic_inc = 0
-            self.monotonic_dec = 0
-            is_unique = 0
-
-        self.need_monotonic_check = 0
-
-        # we can only be sure of uniqueness if is_unique=1
-        if is_unique:
-            self.unique = 1
-            self.need_unique_check = 0
-
-    cpdef get_loc(self, object val):
-        # -> Py_ssize_t | slice | ndarray[bool]
-        cdef:
-            Py_ssize_t loc
-
-        if val is NA:
-            # TODO: return copy? readonly view?
-            # TODO: do this later on to keep same pattern as IndexEngine?
-            if not self.has_missing:
-                raise KeyError(val)
-            return _unpack_bool_indexer(self._mask, val)
-
-        return SharedEngine.get_loc(self, val)
-
-    cdef ndarray _get_bool_indexer(self, val):
-        if val is NA:
-            # TODO: KeyError(val) if not has_missing?
-            # TODO: readonly? copy?
-            return self._mask.view("uint8")
-
-        if util.is_nan(val):
-            res = np.isnan(self._values)
-        else:
-            res = self._values == val
-
-        res[self._mask] = False
-        return res
-
-    cdef _check_type(self, object val):
-        kind = self._values.dtype.kind
-        if kind in ["i", "u"]:
-            if not util.is_integer_object(val):
-                raise KeyError(val)
-            if kind == "u":
-                if val < 0:
-                    # cannot have negative values with unsigned int dtype
-                    raise KeyError(val)
-        elif kind == "b":
-            if not util.is_bool_object(val):
-                raise KeyError(val)
-        else:
-            if not util.is_integer_object(val) and not util.is_float_object(val):
-                # in particular catch bool and avoid casting True -> 1.0
-                raise KeyError(val)
diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py
index 0fa17e4f69d9a..f66489d467032 100644
--- a/pandas/core/indexes/base.py
+++ b/pandas/core/indexes/base.py
@@ -834,16 +834,18 @@ def _cleanup(self) -> None:
     @cache_readonly
     def _engine(
         self,
-    ) -> libindex.IndexEngine | libindex.NullableEngine | libindex.ExtensionEngine:
+    ) -> libindex.IndexEngine:
         # For base class (object dtype) we get ObjectEngine
 
         if isinstance(self._values, BaseMaskedArray):
-            return libindex.NullableEngine(self._values)
+            # TODO(ExtensionIndex): use libindex.NullableEngine(self._values)
+            return libindex.ObjectEngine(self._get_engine_target())
         elif (
             isinstance(self._values, ExtensionArray)
             and self._engine_type is libindex.ObjectEngine
         ):
-            return libindex.ExtensionEngine(self._values)
+            # TODO(ExtensionIndex): use libindex.ExtensionEngine(self._values)
+            return libindex.ObjectEngine(self._get_engine_target())
 
         # to avoid a reference cycle, bind `target_values` to a local variable, so
         # `self` is not passed into the lambda.
@@ -3209,10 +3211,13 @@ def _wrap_setop_result(self, other: Index, result) -> Index:
         name = get_op_result_name(self, other)
         if isinstance(result, Index):
             if result.name != name:
-                return result.rename(name)
-            return result
+                result = result.rename(name)
         else:
-            return self._shallow_copy(result, name=name)
+            result = self._shallow_copy(result, name=name)
+
+        # TODO(ExtensionIndex): revert this astype; it is a kludge to make
+        #  it possible to split ExtensionEngine from ExtensionIndex PR.
+        return result.astype(self.dtype, copy=False)
 
     # TODO: standardize return type of non-union setops type(self vs other)
     @final
@@ -4782,6 +4787,9 @@ def _get_engine_target(self) -> np.ndarray:
         """
         # error: Incompatible return value type (got "Union[ExtensionArray,
         # ndarray]", expected "ndarray")
+        if type(self) is Index and isinstance(self._values, ExtensionArray):
+            # TODO(ExtensionIndex): remove special-case, just use self._values
+            return self._values.astype(object)
         return self._values  # type: ignore[return-value]
 
     def _from_join_target(self, result: np.ndarray) -> ArrayLike:

From c8072c5281a09728334ebd2648d4c673067b34fe Mon Sep 17 00:00:00 2001
From: Brock <jbrockmendel@gmail.com>
Date: Tue, 21 Dec 2021 15:39:23 -0800
Subject: [PATCH 50/57] revert

---
 pandas/_libs/index.pyi | 8 --------
 1 file changed, 8 deletions(-)

diff --git a/pandas/_libs/index.pyi b/pandas/_libs/index.pyi
index 68163c1284b30..446a980487cde 100644
--- a/pandas/_libs/index.pyi
+++ b/pandas/_libs/index.pyi
@@ -1,17 +1,9 @@
-from typing import TYPE_CHECKING
-
 import numpy as np
 
 from pandas._typing import npt
 
 from pandas import MultiIndex
 
-if TYPE_CHECKING:
-    from pandas.core.arrays import (
-        BaseMaskedArray,
-        ExtensionArray,
-    )
-
 class IndexEngine:
     over_size_threshold: bool
     def __init__(self, values: np.ndarray): ...

From 7e0ac18849ce1cf74c5902773f0a76527952a024 Mon Sep 17 00:00:00 2001
From: Brock <jbrockmendel@gmail.com>
Date: Tue, 21 Dec 2021 15:44:36 -0800
Subject: [PATCH 51/57] remove no-longer-necessary

---
 pandas/core/indexes/base.py    | 3 +--
 pandas/tests/base/test_misc.py | 8 +-------
 2 files changed, 2 insertions(+), 9 deletions(-)

diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py
index 15cafce537e17..aebe2f1b36d26 100644
--- a/pandas/core/indexes/base.py
+++ b/pandas/core/indexes/base.py
@@ -6050,8 +6050,7 @@ def map(self, mapper, na_action=None):
                 new_values, dtype=dtype, copy=False, name=self.name
             )
 
-        result = Index._with_infer(new_values, dtype=dtype, copy=False, name=self.name)
-        return result
+        return Index._with_infer(new_values, dtype=dtype, copy=False, name=self.name)
 
     # TODO: De-duplicate with map, xref GH#32349
     @final
diff --git a/pandas/tests/base/test_misc.py b/pandas/tests/base/test_misc.py
index 26d6ed172e78d..0c448ae3ee57c 100644
--- a/pandas/tests/base/test_misc.py
+++ b/pandas/tests/base/test_misc.py
@@ -156,13 +156,7 @@ def test_access_by_position(index_flat):
     assert index[-1] == index[size - 1]
 
     msg = f"index {size} is out of bounds for axis 0 with size {size}"
-    try:
-        eq = is_dtype_equal(index.dtype, "string[pyarrow]")
-    except ImportError:
-        # TODO(GH#44276) is_dtype_equal can raise here
-        eq = False
-
-    if eq:
+    if is_dtype_equal(index.dtype, "string[pyarrow]"):
         msg = "index out of bounds"
     with pytest.raises(IndexError, match=msg):
         index[size]

From 80453b45cddecd3577d28b188386b66b07650355 Mon Sep 17 00:00:00 2001
From: Brock <jbrockmendel@gmail.com>
Date: Tue, 21 Dec 2021 15:52:53 -0800
Subject: [PATCH 52/57] whatsnew

---
 doc/source/whatsnew/v1.4.0.rst | 37 ++++++++++++++++++++++++++++++++++
 1 file changed, 37 insertions(+)

diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst
index 2592be9c4a350..df847fdcb08b0 100644
--- a/doc/source/whatsnew/v1.4.0.rst
+++ b/doc/source/whatsnew/v1.4.0.rst
@@ -90,6 +90,43 @@ be removed in the future, see :ref:`here <whatsnew_140.deprecations.int64_uint64
 
 See :ref:`here <advanced.numericindex>` for more about :class:`NumericIndex`.
 
+
+.. _whatsnew_140.enhancements.ExtensionIndex:
+
+Index can hold arbitrary ExtensionArrays
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Until now, passing a custom :class:`ExtensionArray` to ``pd.Index`` would cast the
+array to ``object`` dtype. Now :class:`Index` can directly hold arbitrary ExtensionArrays (:issue:`43930`).
+
+Consider the following example:
+
+.. ipython:: python
+
+   arr = pd.array([1, 2, pd.NA])
+   idx = pd.Index(arr)
+
+In the old behavior, ``idx`` would be object-dtype:
+
+*Previous behavior*:
+
+.. code-block:: ipython
+
+   In [1]: idx
+   Out[1]: Index([1, 2, <NA>], dtype='object')
+
+With the new behavior, we keep the original dtype:
+
+*New behavior*:
+
+.. ipython:: python
+
+   idx
+
+One exception to this is ``SparseArray``, which will continue to be cast to a
+numpy dtype until pandas 2.0. At that point it will retain its dtype like other
+ExtensionArrays.
+
 .. _whatsnew_140.enhancements.styler:
 
 Styler

From 7231a9e71d6b29f47f913ea0ce5c4de9f452dec7 Mon Sep 17 00:00:00 2001
From: Brock <jbrockmendel@gmail.com>
Date: Tue, 21 Dec 2021 16:02:43 -0800
Subject: [PATCH 53/57] deprecation for SparseArray

---
 pandas/core/indexes/base.py                       |  8 ++++++++
 pandas/tests/extension/test_sparse.py             | 15 ++++++++++++---
 .../tests/indexes/datetimes/test_constructors.py  |  4 +++-
 3 files changed, 23 insertions(+), 4 deletions(-)

diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py
index aebe2f1b36d26..825f80590c0ce 100644
--- a/pandas/core/indexes/base.py
+++ b/pandas/core/indexes/base.py
@@ -570,6 +570,14 @@ def _dtype_to_subclass(cls, dtype: DtypeObj):
                 return PeriodIndex
 
             elif isinstance(dtype, SparseDtype):
+                warnings.warn(
+                    "In a future version, passing a SparseArray to pd.Index "
+                    "will store that array directly instead of converting to a "
+                    "dense numpy ndarray. To retain the old behavior, use "
+                    "pd.Index(arr.to_numpy()) instead",
+                    FutureWarning,
+                    stacklevel=find_stack_level(),
+                )
                 return cls._dtype_to_subclass(dtype.subtype)
 
             return Index
diff --git a/pandas/tests/extension/test_sparse.py b/pandas/tests/extension/test_sparse.py
index d22c70e86ff74..ea715f4fac6db 100644
--- a/pandas/tests/extension/test_sparse.py
+++ b/pandas/tests/extension/test_sparse.py
@@ -193,9 +193,10 @@ def test_reindex(self, data, na_value):
 
 class TestIndex(base.BaseIndexTests):
     def test_index_from_array(self, data):
-        idx = pd.Index(data)
-        # TODO do we want to preserve the sparse dtype in the index
-        # now this is possible?
+        msg = "will store that array directly"
+        with tm.assert_produces_warning(FutureWarning, match=msg):
+            idx = pd.Index(data)
+
         if data.dtype.subtype == "f":
             assert idx.dtype == np.float64
         elif data.dtype.subtype == "i":
@@ -274,6 +275,14 @@ def test_fillna_frame(self, data_missing):
 
 
 class TestMethods(BaseSparseTests, base.BaseMethodsTests):
+    @pytest.mark.parametrize("ascending", [True, False])
+    def test_sort_values_frame(self, data_for_sorting, ascending):
+        msg = "will store that array directly"
+        with tm.assert_produces_warning(
+            FutureWarning, match=msg, check_stacklevel=False
+        ):
+            super().test_sort_values_frame(data_for_sorting, ascending)
+
     def test_combine_le(self, data_repeated):
         # We return a Series[SparseArray].__le__ returns a
         # Series[Sparse[bool]]
diff --git a/pandas/tests/indexes/datetimes/test_constructors.py b/pandas/tests/indexes/datetimes/test_constructors.py
index b27c5852cb97b..b1e764ceb7009 100644
--- a/pandas/tests/indexes/datetimes/test_constructors.py
+++ b/pandas/tests/indexes/datetimes/test_constructors.py
@@ -123,7 +123,9 @@ def test_constructor_from_sparse_array(self):
             Timestamp("2016-05-01T01:00:00.000000"),
         ]
         arr = pd.arrays.SparseArray(values)
-        result = Index(arr)
+        msg = "will store that array directly"
+        with tm.assert_produces_warning(FutureWarning, match=msg):
+            result = Index(arr)
         expected = DatetimeIndex(values)
         tm.assert_index_equal(result, expected)
 

From f78aa0f8f69f96e6da769ac1a2e3962462208d05 Mon Sep 17 00:00:00 2001
From: Brock <jbrockmendel@gmail.com>
Date: Tue, 21 Dec 2021 16:28:28 -0800
Subject: [PATCH 54/57] share _na_value method

---
 pandas/core/indexes/base.py                    | 8 ++++++--
 pandas/core/indexes/datetimelike.py            | 4 ----
 pandas/tests/indexes/datetimelike_/test_nat.py | 1 -
 3 files changed, 6 insertions(+), 7 deletions(-)

diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py
index 825f80590c0ce..5698456c22f01 100644
--- a/pandas/core/indexes/base.py
+++ b/pandas/core/indexes/base.py
@@ -23,6 +23,7 @@
 from pandas._config import get_option
 
 from pandas._libs import (
+    NaT,
     algos as libalgos,
     index as libindex,
     lib,
@@ -2616,9 +2617,12 @@ def __reduce__(self):
     @cache_readonly
     def _na_value(self):
         """The expected NA value to use with this index."""
-        if isinstance(self.dtype, np.dtype):
+        dtype = self.dtype
+        if isinstance(dtype, np.dtype):
+            if dtype.kind in ["m", "M"]:
+                return NaT
             return np.nan
-        return self.dtype.na_value
+        return dtype.na_value
 
     @cache_readonly
     def _isnan(self) -> npt.NDArray[np.bool_]:
diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py
index 731efdc3b17f0..589b92f392ca8 100644
--- a/pandas/core/indexes/datetimelike.py
+++ b/pandas/core/indexes/datetimelike.py
@@ -24,7 +24,6 @@
 )
 from pandas._libs.tslibs import (
     BaseOffset,
-    NaTType,
     Resolution,
     Tick,
     parsing,
@@ -154,9 +153,6 @@ def __contains__(self, key: Any) -> bool:
 
     _can_hold_na = True
 
-    _na_value: NaTType = NaT
-    """The expected NA value to use with this index."""
-
     def _convert_tolerance(self, tolerance, target):
         tolerance = np.asarray(to_timedelta(tolerance).to_numpy())
         return super()._convert_tolerance(tolerance, target)
diff --git a/pandas/tests/indexes/datetimelike_/test_nat.py b/pandas/tests/indexes/datetimelike_/test_nat.py
index b4a72ec65bd91..50cf29d016355 100644
--- a/pandas/tests/indexes/datetimelike_/test_nat.py
+++ b/pandas/tests/indexes/datetimelike_/test_nat.py
@@ -17,7 +17,6 @@ def test_nat(self, index_without_na):
         index_with_na = index_without_na.copy(deep=True)
         index_with_na._data[1] = NaT
 
-        assert type(index_without_na)._na_value is NaT
         assert empty_index._na_value is NaT
         assert index_with_na._na_value is NaT
         assert index_without_na._na_value is NaT

From 453d6aec93bdc35494472aa802804c108826b606 Mon Sep 17 00:00:00 2001
From: Brock <jbrockmendel@gmail.com>
Date: Wed, 22 Dec 2021 09:51:02 -0800
Subject: [PATCH 55/57] mypy fixup, npdev catch warnings

---
 pandas/core/indexes/base.py          |  8 +++++---
 pandas/tests/base/test_conversion.py | 12 ++++++++++--
 pandas/tests/series/test_ufunc.py    | 11 +++++++++--
 3 files changed, 24 insertions(+), 7 deletions(-)

diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py
index 5698456c22f01..bbcc40fa78d49 100644
--- a/pandas/core/indexes/base.py
+++ b/pandas/core/indexes/base.py
@@ -3768,7 +3768,7 @@ def _get_indexer(
             if target._is_multi and self._is_multi:
                 engine = self._engine
                 # error: "IndexEngine" has no attribute "_extract_level_codes"
-                tgt_values = engine._extract_level_codes(  # type: ignore[union-attr]
+                tgt_values = engine._extract_level_codes(  # type: ignore[attr-defined]
                     target
                 )
 
@@ -3848,7 +3848,7 @@ def _get_fill_indexer(
             # TODO: get_indexer_with_fill docstring says values must be _sorted_
             #  but that doesn't appear to be enforced
             # error: "IndexEngine" has no attribute "get_indexer_with_fill"
-            return self._engine.get_indexer_with_fill(  # type: ignore[union-attr]
+            return self._engine.get_indexer_with_fill(  # type: ignore[attr-defined]
                 target=target._values, values=self._values, method=method, limit=limit
             )
 
@@ -5692,7 +5692,9 @@ def get_indexer_non_unique(
         if self._is_multi and target._is_multi:
             engine = self._engine
             # error: "IndexEngine" has no attribute "_extract_level_codes"
-            tgt_values = engine._extract_level_codes(target)  # type: ignore[union-attr]
+            tgt_values = engine._extract_level_codes(
+                target  # type: ignore[attr-defined]
+            )
 
         indexer, missing = self._engine.get_indexer_non_unique(tgt_values)
         return ensure_platform_int(indexer), ensure_platform_int(missing)
diff --git a/pandas/tests/base/test_conversion.py b/pandas/tests/base/test_conversion.py
index 49afe2859098d..84e4992cce0e3 100644
--- a/pandas/tests/base/test_conversion.py
+++ b/pandas/tests/base/test_conversion.py
@@ -256,11 +256,14 @@ def test_numpy_array_all_dtypes(any_numpy_dtype):
 )
 def test_array(arr, attr, index_or_series, request):
     box = index_or_series
+    warn = None
     if arr.dtype.name in ("Sparse[int64, 0]") and box is pd.Index:
         mark = pytest.mark.xfail(reason="Index cannot yet store sparse dtype")
         request.node.add_marker(mark)
+        warn = FutureWarning
 
-    result = box(arr, copy=False).array
+    with tm.assert_produces_warning(warn):
+        result = box(arr, copy=False).array
 
     if attr:
         arr = getattr(arr, attr)
@@ -330,7 +333,12 @@ def test_array_multiindex_raises():
 )
 def test_to_numpy(arr, expected, index_or_series_or_array, request):
     box = index_or_series_or_array
-    thing = box(arr)
+
+    warn = None
+    if index_or_series_or_array is pd.Index and isinstance(arr, SparseArray):
+        warn = FutureWarning
+    with tm.assert_produces_warning(warn):
+        thing = box(arr)
 
     if arr.dtype.name == "int64" and box is pd.array:
         mark = pytest.mark.xfail(reason="thing is Int64 and to_numpy() returns object")
diff --git a/pandas/tests/series/test_ufunc.py b/pandas/tests/series/test_ufunc.py
index 23c432b2d10bf..d4e528789dedb 100644
--- a/pandas/tests/series/test_ufunc.py
+++ b/pandas/tests/series/test_ufunc.py
@@ -85,7 +85,10 @@ def test_binary_ufunc_with_index(flip, sparse, ufunc, arrays_for_binary_ufunc):
 
     name = "name"  # op(pd.Series, array) preserves the name.
     series = pd.Series(a1, name=name)
-    other = pd.Index(a2, name=name).astype("int64")
+
+    warn = None if not sparse else FutureWarning
+    with tm.assert_produces_warning(warn):
+        other = pd.Index(a2, name=name).astype("int64")
 
     array_args = (a1, a2)
     series_args = (series, other)  # ufunc(series, array)
@@ -275,7 +278,11 @@ def test_reduce(values, box, request):
             # ATM Index casts to object, so we get python ints/floats
             same_type = False
 
-    obj = box(values)
+    warn = None
+    if values.dtype == "Sparse[int]" and box is pd.Index:
+        warn = FutureWarning
+    with tm.assert_produces_warning(warn):
+        obj = box(values)
 
     result = np.maximum.reduce(obj)
     expected = values[1]

From 8750248c9bf967620d434f57c1a232069ba3d5c1 Mon Sep 17 00:00:00 2001
From: Brock <jbrockmendel@gmail.com>
Date: Wed, 22 Dec 2021 10:37:51 -0800
Subject: [PATCH 56/57] mypy fixup

---
 pandas/core/indexes/base.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py
index bbcc40fa78d49..dff6b58437be3 100644
--- a/pandas/core/indexes/base.py
+++ b/pandas/core/indexes/base.py
@@ -5692,8 +5692,8 @@ def get_indexer_non_unique(
         if self._is_multi and target._is_multi:
             engine = self._engine
             # error: "IndexEngine" has no attribute "_extract_level_codes"
-            tgt_values = engine._extract_level_codes(
-                target  # type: ignore[attr-defined]
+            tgt_values = engine._extract_level_codes(  # type: ignore[attr-defined]
+                target
             )
 
         indexer, missing = self._engine.get_indexer_non_unique(tgt_values)

From d2e0266363ad2e5ffe89670ad19c60beca0f5e37 Mon Sep 17 00:00:00 2001
From: Brock <jbrockmendel@gmail.com>
Date: Wed, 22 Dec 2021 17:00:44 -0800
Subject: [PATCH 57/57] compat for older numpy

---
 pandas/tests/series/test_ufunc.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/pandas/tests/series/test_ufunc.py b/pandas/tests/series/test_ufunc.py
index d4e528789dedb..dd0d999d438fa 100644
--- a/pandas/tests/series/test_ufunc.py
+++ b/pandas/tests/series/test_ufunc.py
@@ -4,6 +4,8 @@
 import numpy as np
 import pytest
 
+from pandas.core.dtypes.common import is_dtype_equal
+
 import pandas as pd
 import pandas._testing as tm
 from pandas.arrays import SparseArray
@@ -279,7 +281,7 @@ def test_reduce(values, box, request):
             same_type = False
 
     warn = None
-    if values.dtype == "Sparse[int]" and box is pd.Index:
+    if is_dtype_equal(values.dtype, "Sparse[int]") and box is pd.Index:
         warn = FutureWarning
     with tm.assert_produces_warning(warn):
         obj = box(values)