pandas-dev · jreback · Dec 31, 2021 · Oct 8, 2021 · Oct 8, 2021 · Oct 8, 2021
diff --git a/pandas/_libs/index.pyx b/pandas/_libs/index.pyx
diff --git a/pandas/_libs/lib.pxd b/pandas/_libs/lib.pxd
@@ -1 +1,5 @@
+from numpy cimport ndarray
+
 cdef bint c_is_list_like(object, bint) except -1
+
+cpdef ndarray eq_NA_compat(ndarray[object] arr, object key)
diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx
@@ -3028,3 +3028,23 @@ def is_bool_list(obj: list) -> bool:
 
     # Note: we return True for empty list
     return True
+
+
+cpdef ndarray eq_NA_compat(ndarray[object] arr, object key):  # NA-safe arr == key
+    cdef:
+        ndarray[uint8_t, cast=True] result = np.empty(len(arr), dtype=bool)
+        Py_ssize_t i
+        object item
+
+    if key is C_NA:  # NA key: match by identity only, never via ==
+        for i in range(len(arr)):
+            item = arr[i]
+            result[i] = item is C_NA
+    else:  # non-NA key
+        for i in range(len(arr)):
+            item = arr[i]
+            if item is C_NA:
+                result[i] = False  # C_NA never matches a non-NA key
+            else:
+                result[i] = item == key  # FIXME: compat for other NAs
+    return result  # one bool per element of arr
diff --git a/pandas/_libs/testing.pyx b/pandas/_libs/testing.pyx
@@ -13,6 +13,7 @@ from pandas._libs.util cimport (
     is_real_number_object,
 )
 
+from pandas._libs.missing cimport is_matching_na
 from pandas.core.dtypes.common import is_dtype_equal
 from pandas.core.dtypes.missing import (
     array_equivalent,
@@ -174,11 +175,18 @@ cpdef assert_almost_equal(a, b,
         # classes can't be the same, to raise error
         assert_class_equal(a, b, obj=obj)
 
-    if isna(a) and isna(b):
-        # TODO: Should require same-dtype NA?
-        # nan / None comparison
-        return True
+    if isna(a):
+        if isna(b):
+            # TODO: Should require same-dtype NA?
+            # nan / None comparison
+            return True
+
+        assert False, f"expected {a} but got {b}"
+
+    elif isna(b):
+        assert False, f"expected {a} but got {b}"
 
+    # TODO: test for tm.assert_whatever with pd.NA that would raise here
     if a == b:
         # object comparison
         return True

diff --git a/pandas/_testing/asserters.py b/pandas/_testing/asserters.py
@@ -401,8 +401,16 @@ def _get_ilevel_values(index, level):
     # skip exact index checking when `check_categorical` is False
     if check_exact and check_categorical:
         if not left.equals(right):
+            mismatch = left._values != right._values
+
+            if not isinstance(mismatch, np.ndarray):
+                # i.e. it's a MaskedArray
+                mismatch = mismatch.to_numpy(dtype=int, na_value=0)
+                mismask = left._values._mask ^ right._values._mask
+                mismatch[mismask] = 1
+
             diff = (
-                np.sum((left._values != right._values).astype(int)) * 100.0 / len(left)
+                np.sum(mismatch.astype(int)) * 100.0 / len(left)
             )
             msg = f"{obj} values are different ({np.round(diff, 5)} %)"
             raise_assert_detail(obj, msg, left, right)

diff --git a/pandas/conftest.py b/pandas/conftest.py
@@ -498,6 +498,14 @@ def _create_mi_with_dt64tz_level():
     "mi-with-dt64tz-level": _create_mi_with_dt64tz_level(),
     "multi": _create_multiindex(),
     "repeats": Index([0, 0, 1, 1, 2, 2]),
+    "nullable_int": Index(np.arange(100), dtype="Int64"),
+    "nullable_float": Index(np.arange(100), dtype="Float32"),
+    "nullable_bool": Index(np.arange(100).astype(bool), dtype="boolean"),
+    #"nullable_int-na": Index(np.arange(100), dtype="Int64").insert(1, pd.NA),
+    #"nullable_float-na": Index(np.arange(100), dtype="Float32").insert(1, pd.NA),
+    #"nullable_bool-na": Index(np.arange(100).astype(bool), dtype="boolean").insert(
+    #    1, pd.NA
+    #),
 }
 
 

diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py
@@ -1349,6 +1349,25 @@ def _reduce(self, name: str, *, skipna: bool = True, **kwargs):
     # ------------------------------------------------------------------------
     # Non-Optimized Default Methods
 
+    def putmask(self, mask: np.ndarray, value) -> None:
+        """
+        Assign ``value`` into self where ``mask`` is set, like np.putmask.
+
+        Parameters
+        ----------
+        mask : np.ndarray[bool]
+        value : scalar or listlike
+            A listlike is indexed with ``mask`` before being assigned.
+
+        Raises
+        ------
+        TypeError
+            If value cannot be inserted into self.
+        """
+        # a listlike contributes only its masked positions; a scalar goes as-is
+        new_values = value[mask] if is_list_like(value) else value
+        self[mask] = new_values
+
     def tolist(self) -> list:
         """
         Return a list of the values.

diff --git a/pandas/core/arrays/floating.py b/pandas/core/arrays/floating.py
@@ -108,6 +108,8 @@ def coerce_to_array(
     if dtype is None and hasattr(values, "dtype"):
         if is_float_dtype(values.dtype):
             dtype = values.dtype
+            if dtype == "float16":
+                raise TypeError("FloatingArray does not support float16 dtype")
 
     if dtype is not None:
         if isinstance(dtype, str) and dtype.startswith("Float"):
@@ -254,7 +256,8 @@ def dtype(self) -> FloatingDtype:
         return FLOAT_STR_TO_DTYPE[str(self._data.dtype)]
 
     def __init__(self, values: np.ndarray, mask: np.ndarray, copy: bool = False):
-        if not (isinstance(values, np.ndarray) and values.dtype.kind == "f"):
+        if not (isinstance(values, np.ndarray) and values.dtype.kind == "f" and values.dtype.itemsize > 2):
+            # We do not support float16
             raise TypeError(
                 "values should be floating numpy array. Use "
                 "the 'pd.array' function instead"
@@ -422,6 +425,8 @@ def _maybe_mask_result(self, result, mask, other, op_name: str):
 
         return type(self)(result, mask, copy=False)
 
+    def isna(self) -> np.ndarray:
+        return self._mask | np.isnan(self._data)  # masked or NaN counts as NA
 
 _dtype_docstring = """
 An ExtensionDtype for {dtype} data.

diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py
@@ -412,6 +412,9 @@ def reconstruct(x):
                 m = mask.copy()
                 return IntegerArray(x, m)
             elif is_float_dtype(x.dtype):
+                if x.dtype.itemsize <= 2:
+                    # we don't support float16
+                    x = x.astype(np.float32)
                 m = mask.copy()
                 return FloatingArray(x, m)
             else:
@@ -564,7 +567,7 @@ def value_counts(self, dropna: bool = True) -> Series:
         # TODO(extension)
         # if we have allow Index to hold an ExtensionArray
         # this is easier
-        index = value_counts.index._values.astype(object)
+        index = value_counts.index  # ._values.astype(object)
 
         # if we want nans, count the mask
         if dropna:
@@ -574,10 +577,13 @@ def value_counts(self, dropna: bool = True) -> Series:
             counts[:-1] = value_counts
             counts[-1] = self._mask.sum()
 
-            index = Index(
-                np.concatenate([index, np.array([self.dtype.na_value], dtype=object)]),
-                dtype=object,
-            )
+            # Append NA at the very end so it lines up with counts[-1] above.
+            # Index.insert follows np.insert semantics for negative positions,
+            # so loc=-1 would place NA *before* the last label and misalign it
+            # with its count; len(index) is the position past the last label.
+            index = index.insert(len(index), self.dtype.na_value)
+
+        index = index.astype(self.dtype)
 
         mask = np.zeros(len(counts), dtype="bool")
         counts = IntegerArray(counts, mask)

diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py
@@ -26,6 +26,7 @@
     algos as libalgos,
     index as libindex,
     lib,
+    missing as libmissing,
 )
 import pandas._libs.join as libjoin
 from pandas._libs.lib import (
@@ -135,6 +136,7 @@
     tz_to_dtype,
     validate_tz_from_dtype,
 )
+from pandas.core.arrays.masked import BaseMaskedArray
 from pandas.core.arrays.sparse import SparseDtype
 from pandas.core.base import (
     IndexOpsMixin,
@@ -356,7 +358,7 @@ def _outer_indexer(
 
     _typ: str = "index"
     _data: ExtensionArray | np.ndarray
-    _data_cls: type[np.ndarray] | type[ExtensionArray] = np.ndarray
+    _data_cls: type[np.ndarray] | type[ExtensionArray] = (np.ndarray, ExtensionArray)
     _id: object | None = None
     _name: Hashable = None
     # MultiIndex.levels previously allowed setting the index name. We
@@ -411,8 +413,9 @@ def __new__(
             validate_tz_from_dtype(dtype, tz)
             dtype = tz_to_dtype(tz)
 
-        if isinstance(data, PandasArray):
-            # ensure users don't accidentally put a PandasArray in an index.
+        if type(data) is PandasArray:
+            # ensure users don't accidentally put a PandasArray in an index,
+            #  but don't unpack StringArray
             data = data.to_numpy()
         if isinstance(dtype, PandasDtype):
             dtype = dtype.numpy_dtype
@@ -434,7 +437,6 @@ def __new__(
 
             ea_cls = dtype.construct_array_type()
             data = ea_cls._from_sequence(data, dtype=dtype, copy=copy)
-            data = np.asarray(data, dtype=object)
             disallow_kwargs(kwargs)
             return Index._simple_new(data, name=name)
 
@@ -446,8 +448,8 @@ def __new__(
                     return result.astype(dtype, copy=False)
                 return result
 
-            data = np.array(data, dtype=object, copy=copy)
             disallow_kwargs(kwargs)
+            data = extract_array(data, extract_numpy=True)
             return Index._simple_new(data, name=name)
 
         # index-like
@@ -657,6 +659,7 @@ def _with_infer(cls, *args, **kwargs):
         Constructor that uses the 1.0.x behavior inferring numeric dtypes
         for ndarray[object] inputs.
         """
+
         with warnings.catch_warnings():
             warnings.filterwarnings("ignore", ".*the Index constructor", FutureWarning)
             result = cls(*args, **kwargs)
@@ -812,6 +815,15 @@ def _cleanup(self) -> None:
     def _engine(self) -> libindex.IndexEngine:
         # For base class (object dtype) we get ObjectEngine
 
+        if isinstance(self._values, BaseMaskedArray):
+            return libindex.NullableEngine(self._values)
+        elif (
+            isinstance(self._values, ExtensionArray)
+            and self._engine_type is libindex.ObjectEngine
+        ):
+            return libindex.ExtensionEngine(self._values)
+
+        assert self.dtype != "boolean"
         # to avoid a reference cycle, bind `target_values` to a local variable, so
         # `self` is not passed into the lambda.
         target_values = self._get_engine_target()
@@ -1025,9 +1037,15 @@ def take(
 
         # Note: we discard fill_value and use self._na_value, only relevant
         #  in the case where allow_fill is True and fill_value is not None
-        taken = algos.take(
-            self._values, indices, allow_fill=allow_fill, fill_value=self._na_value
-        )
+        values = self._values
+        if isinstance(values, np.ndarray):
+            taken = algos.take(
+                values, indices, allow_fill=allow_fill, fill_value=self._na_value
+            )
+        else:
+            taken = values.take(
+                indices, allow_fill=allow_fill, fill_value=self._na_value
+            )
         # _constructor so RangeIndex->Int64Index
         return self._constructor._simple_new(taken, name=self.name)
 
@@ -3572,6 +3590,7 @@ def get_indexer(
 
             indexer = self._engine.get_indexer(target.codes)
             if self.hasnans and target.hasnans:
+                #loc = self.get_loc(libmissing.NA)
                 loc = self.get_loc(np.nan)
                 mask = target.isna()
                 indexer[mask] = loc
@@ -3590,6 +3609,7 @@ def get_indexer(
                 # Exclude MultiIndex because hasnans raises NotImplementedError
                 # we should only get here if we are unique, so loc is an integer
                 # GH#41934
+                #loc = self.get_loc(libmissing.NA)
                 loc = self.get_loc(np.nan)
                 mask = target.isna()
                 indexer[mask] = loc
@@ -6353,6 +6373,18 @@ def insert(self, loc: int, item) -> Index:
 
         arr = self._values
 
+        if isinstance(arr, ExtensionArray):
+            # TODO: need EA.insert
+            try:
+                arr2 = type(arr)._from_sequence([item], dtype=arr.dtype)
+            except TypeError:
+                # TODO: make this into _validate_fill_value
+                dtype = self._find_common_type_compat(item)
+                return self.astype(dtype).insert(loc, item)
+
+            res_values = arr._concat_same_type([arr[:loc], arr2, arr[loc:]])
+            return type(self)._simple_new(res_values, name=self.name)
+
         if arr.dtype != object or not isinstance(
             item, (tuple, np.datetime64, np.timedelta64)
         ):

diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py
@@ -725,7 +725,12 @@ def _get_indexer_pointwise(
                 if isinstance(locs, slice):
                     # Only needed for get_indexer_non_unique
                     locs = np.arange(locs.start, locs.stop, locs.step, dtype="intp")
-                locs = np.array(locs, ndmin=1)
+                elif lib.is_integer(locs):
+                    locs = np.array(locs, ndmin=1)
+                else:
+                    # FIXME: This is wrong; it's boolean; not reached
+                    assert locs.dtype.kind == "i"
+
             except KeyError:
                 missing.append(i)
                 locs = np.array([-1])

diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py
@@ -964,7 +964,7 @@ def _validate_key(self, key, axis: int):
         # slice of labels (where start-end in labels)
         # slice of integers (only if in the labels)
         # boolean not in slice and with boolean index
-        if isinstance(key, bool) and not is_bool_dtype(self.obj.index):
+        if isinstance(key, bool) and not (is_bool_dtype(self.obj.index) or self.obj.index.dtype.name == "boolean"):
             raise KeyError(
                 f"{key}: boolean label can not be used without a boolean index"
             )

diff --git a/pandas/tests/arithmetic/test_numeric.py b/pandas/tests/arithmetic/test_numeric.py
@@ -1403,14 +1403,23 @@ def test_integer_array_add_list_like(
     left = container + box_1d_array(data)
     right = box_1d_array(data) + container
 
-    if Series == box_pandas_1d_array:
-        expected = Series(expected_data, dtype="Int64")
-    elif Series == box_1d_array:
-        expected = Series(expected_data, dtype="object")
-    elif Index in (box_pandas_1d_array, box_1d_array):
-        expected = Int64Index(expected_data)
+    if Series in [box_1d_array, box_pandas_1d_array]:
+        cls = Series
+    elif Index in [box_1d_array, box_pandas_1d_array]:
+        cls = Index
     else:
-        expected = np.array(expected_data, dtype="object")
+        cls = np.array
+
+    if box_pandas_1d_array in [Index, Series]:
+        expected = cls(expected_data, dtype="Int64")
+
+    elif box_1d_array == Index:
+        # tm.to_array casts to object, Index constructor does inference
+        expected = cls(expected_data, dtype="int64")
+
+    else:
+        # tm.to_array casts to object, no inference
+        expected = cls(expected_data, dtype="object")
 
     tm.assert_equal(left, expected)
     tm.assert_equal(right, expected)

diff --git a/pandas/tests/arrays/boolean/test_function.py b/pandas/tests/arrays/boolean/test_function.py
@@ -77,18 +77,21 @@ def test_ufunc_reduce_raises(values):
 def test_value_counts_na():
     arr = pd.array([True, False, pd.NA], dtype="boolean")
     result = arr.value_counts(dropna=False)
-    expected = pd.Series([1, 1, 1], index=[True, False, pd.NA], dtype="Int64")
+    expected = pd.Series([1, 1, 1], index=arr, dtype="Int64")
+    assert expected.index.dtype == arr.dtype  # expected index keeps arr's dtype
     tm.assert_series_equal(result, expected)
 
     result = arr.value_counts(dropna=True)
-    expected = pd.Series([1, 1], index=[True, False], dtype="Int64")
+    expected = pd.Series([1, 1], index=arr[:-1], dtype="Int64")  # drop trailing NA
+    assert expected.index.dtype == arr.dtype  # expected index keeps arr's dtype
     tm.assert_series_equal(result, expected)
 
 
 def test_value_counts_with_normalize():
-    s = pd.Series([True, False, pd.NA], dtype="boolean")
-    result = s.value_counts(normalize=True)
-    expected = pd.Series([1, 1], index=[True, False], dtype="Float64") / 2
+    ser = pd.Series([True, False, pd.NA], dtype="boolean")
+    result = ser.value_counts(normalize=True)
+    expected = pd.Series([1, 1], index=ser[:-1], dtype="Float64") / 2
+    assert expected.index.dtype == "boolean"  # index built from ser[:-1] values
     tm.assert_series_equal(result, expected)