From 045391c81217d11555e493f4ebb5fb6f4dbcc61a Mon Sep 17 00:00:00 2001
From: Tom Augspurger <tom.w.augspurger@gmail.com>
Date: Fri, 13 Dec 2019 14:40:35 -0600
Subject: [PATCH] [WIP]: Indexing with BooleanArray

This implements indexing a Series with a BooleanArray for some
dtypes. NA values in the mask propagate.

This required updating each EA's `__getitem__` to correctly propagate
NA. This isn't an option for ndarray-backed Series, which don't
understand BooleanArray.

No need to review in detail. The primary motivation for pushing this up
is to better understand what indexing with NA values will look like in
practice.
---
 pandas/core/arrays/boolean.py                |  9 +++
 pandas/core/arrays/categorical.py            | 11 +++-
 pandas/core/arrays/datetimelike.py           | 14 ++++-
 pandas/core/arrays/integer.py                | 11 ++++
 pandas/core/arrays/string_.py                | 16 +++++-
 pandas/core/common.py                        |  6 +-
 pandas/core/indexes/base.py                  |  8 ++-
 pandas/core/indexing.py                      |  8 ++-
 pandas/core/internals/blocks.py              |  6 +-
 pandas/core/internals/managers.py            |  8 ++-
 pandas/core/series.py                        | 17 +++---
 pandas/tests/frame/indexing/test_indexing.py |  2 +
 pandas/tests/indexing/test_na_indexing.py    | 58 ++++++++++++++++++++
 13 files changed, 150 insertions(+), 24 deletions(-)
 create mode 100644 pandas/tests/indexing/test_na_indexing.py

diff --git a/pandas/core/arrays/boolean.py b/pandas/core/arrays/boolean.py
index 743d45e1fa400..66c32e0a1b9c6 100644
--- a/pandas/core/arrays/boolean.py
+++ b/pandas/core/arrays/boolean.py
@@ -24,6 +24,7 @@
 )
 from pandas.core.dtypes.dtypes import register_extension_dtype
 from pandas.core.dtypes.generic import ABCDataFrame, ABCIndexClass, ABCSeries
+from pandas.core.dtypes.inference import is_array_like
 from pandas.core.dtypes.missing import isna, notna
 
 from pandas.core import nanops, ops
@@ -294,6 +295,14 @@ def __getitem__(self, item):
             if self._mask[item]:
                 return self.dtype.na_value
             return self._data[item]
+        elif is_array_like(item) and is_bool_dtype(item.dtype):
+            if isinstance(item, BooleanArray):
+                # items, mask = item._data, item._mask
+                take = item._data | item._mask
+                result = self._data[take]
+                # output masked anywhere where self is masked, or the input was masked
+                omask = (self._mask | item._mask)[take]
+                return type(self)(result, omask)
         return type(self)(self._data[item], self._mask[item])
 
     def _coerce_to_ndarray(self, dtype=None, na_value: "Scalar" = libmissing.NA):
diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py
index 71fe944827a74..1e4f618ba70b2 100644
--- a/pandas/core/arrays/categorical.py
+++ b/pandas/core/arrays/categorical.py
@@ -46,6 +46,7 @@
 from pandas.core.accessor import PandasDelegate, delegate_names
 import pandas.core.algorithms as algorithms
 from pandas.core.algorithms import _get_data_algo, factorize, take, take_1d, unique1d
+from pandas.core.arrays import BooleanArray
 from pandas.core.base import NoNewAttributesMixin, PandasObject, _shared_docs
 import pandas.core.common as com
 from pandas.core.construction import array, extract_array, sanitize_array
@@ -1996,10 +1997,14 @@ def __getitem__(self, key):
                 return np.nan
             else:
                 return self.categories[i]
+        elif com.is_bool_indexer(key) and isinstance(key, BooleanArray):
+            take = key._data | key._mask
+            values = self._codes[take]
+            values[key._mask[take]] = -1
         else:
-            return self._constructor(
-                values=self._codes[key], dtype=self.dtype, fastpath=True
-            )
+            values = self._codes[key]
+
+        return self._constructor(values=values, dtype=self.dtype, fastpath=True)
 
     def __setitem__(self, key, value):
         """
diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py
index f5d1e62f44fd0..d448141c27a40 100644
--- a/pandas/core/arrays/datetimelike.py
+++ b/pandas/core/arrays/datetimelike.py
@@ -40,6 +40,7 @@
 from pandas._typing import DatetimeLikeScalar
 from pandas.core import missing, nanops
 from pandas.core.algorithms import checked_add_with_arr, take, unique1d, value_counts
+from pandas.core.arrays import BooleanArray
 import pandas.core.common as com
 from pandas.core.ops.common import unpack_zerodim_and_defer
 from pandas.core.ops.invalid import make_invalid_op
@@ -415,8 +416,15 @@ def __getitem__(self, key):
             val = getitem(key)
             return self._box_func(val)
 
+        mask = None
         if com.is_bool_indexer(key):
-            key = np.asarray(key, dtype=bool)
+            if isinstance(key, BooleanArray):
+                # TODO: Handle all boolean indexers.
+                mask = key._mask
+                key = key._data | mask
+                mask = mask[key]
+            else:
+                key = np.asarray(key, dtype=bool)
             if key.all():
                 key = slice(0, None, None)
             else:
@@ -438,6 +446,10 @@ def __getitem__(self, key):
                 freq = self.freq
 
         result = getitem(key)
+        if mask is not None:
+            # TODO: Check that we've copied!
+            result[mask] = iNaT
+
         if result.ndim > 1:
             # To support MPL which performs slicing with 2 dim
             # even though it only has 1 dim by definition
diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py
index d47e7e3df27e1..45ff238bd810b 100644
--- a/pandas/core/arrays/integer.py
+++ b/pandas/core/arrays/integer.py
@@ -21,6 +21,7 @@
     is_scalar,
 )
 from pandas.core.dtypes.dtypes import register_extension_dtype
+from pandas.core.dtypes.inference import is_array_like
 from pandas.core.dtypes.missing import isna, notna
 
 from pandas.core import nanops, ops
@@ -366,10 +367,20 @@ def fmt(x):
         return fmt
 
     def __getitem__(self, item):
+        from pandas.core.arrays import BooleanArray
+
         if is_integer(item):
             if self._mask[item]:
                 return self.dtype.na_value
             return self._data[item]
+        elif is_array_like(item) and is_bool_dtype(item.dtype):
+            if isinstance(item, BooleanArray):
+                # items, mask = item._data, item._mask
+                take = item._data | item._mask
+                result = self._data[take]
+                # output masked anywhere where self is masked, or the input was masked
+                omask = (self._mask | item._mask)[take]
+                return type(self)(result, omask)
         return type(self)(self._data[item], self._mask[item])
 
     def _coerce_to_ndarray(self):
diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py
index 2de19a3319cc5..59535803266db 100644
--- a/pandas/core/arrays/string_.py
+++ b/pandas/core/arrays/string_.py
@@ -13,7 +13,8 @@
 
 from pandas import compat
 from pandas.core import ops
-from pandas.core.arrays import PandasArray
+from pandas.core.arrays import BooleanArray, PandasArray
+import pandas.core.common as com
 from pandas.core.construction import extract_array
 from pandas.core.missing import isna
 
@@ -232,6 +233,19 @@ def __setitem__(self, key, value):
 
         super().__setitem__(key, value)
 
+    def __getitem__(self, item):
+        # Doing this here, as PandasArray.__getitem__ can't guarantee dtype stability
+        # when getting with a boolean mask.
+        if com.is_bool_indexer(item):
+            if isinstance(item, BooleanArray):
+                # items, mask = item._data, item._mask
+                take = item._data | item._mask
+                result = self[take]
+                # That copies, right?
+                result[item._mask[take]] = self.dtype.na_value
+                return result
+        return super().__getitem__(item)
+
     def fillna(self, value=None, method=None, limit=None):
         # TODO: validate dtype
         return super().fillna(value, method, limit)
diff --git a/pandas/core/common.py b/pandas/core/common.py
index 9017584171850..fdbedbbd4cddd 100644
--- a/pandas/core/common.py
+++ b/pandas/core/common.py
@@ -132,9 +132,9 @@ def is_bool_indexer(key: Any) -> bool:
         elif is_bool_dtype(key.dtype):
             # an ndarray with bool-dtype by definition has no missing values.
             # So we only need to check for NAs in ExtensionArrays
-            if is_extension_array_dtype(key.dtype):
-                if np.any(key.isna()):
-                    raise ValueError(na_msg)
+            # if is_extension_array_dtype(key.dtype):
+            #     if np.any(key.isna()):
+            #         raise ValueError(na_msg)
             return True
     elif isinstance(key, list):
         try:
diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py
index fc2412ceaca0e..4145db3e57ee8 100644
--- a/pandas/core/indexes/base.py
+++ b/pandas/core/indexes/base.py
@@ -65,7 +65,7 @@
 from pandas.core import ops
 from pandas.core.accessor import CachedAccessor
 import pandas.core.algorithms as algos
-from pandas.core.arrays import ExtensionArray
+from pandas.core.arrays import BooleanArray, ExtensionArray
 from pandas.core.base import IndexOpsMixin, PandasObject
 import pandas.core.common as com
 from pandas.core.construction import extract_array
@@ -4014,7 +4014,11 @@ def __getitem__(self, key):
             return promote(getitem(key))
 
         if com.is_bool_indexer(key):
-            key = np.asarray(key, dtype=bool)
+            if isinstance(key, BooleanArray):
+                # TODO: Handle all boolean indexers.
+                key = key._data | key._mask
+            else:
+                key = np.asarray(key, dtype=bool)
 
         key = com.values_from_object(key)
         result = getitem(key)
diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py
index 341262313ddff..e97549b75fb03 100755
--- a/pandas/core/indexing.py
+++ b/pandas/core/indexing.py
@@ -8,6 +8,7 @@
 from pandas.util._decorators import Appender
 
 from pandas.core.dtypes.common import (
+    is_extension_array_dtype,
     is_float,
     is_integer,
     is_iterator,
@@ -19,6 +20,7 @@
 )
 from pandas.core.dtypes.concat import concat_compat
 from pandas.core.dtypes.generic import ABCDataFrame, ABCMultiIndex, ABCSeries
+from pandas.core.dtypes.inference import is_array_like
 from pandas.core.dtypes.missing import _infer_fill_value, isna
 
 import pandas.core.common as com
@@ -2309,6 +2311,7 @@ def check_bool_indexer(index: Index, key) -> np.ndarray:
         If the index of the key is unalignable to index.
     """
     result = key
+    mask = None
     if isinstance(key, ABCSeries) and not key.index.equals(index):
         result = result.reindex(index)
         mask = isna(result._values)
@@ -2319,6 +2322,9 @@ def check_bool_indexer(index: Index, key) -> np.ndarray:
                 "the indexed object do not match)."
             )
         result = result.astype(bool)._values
+    elif is_array_like(result) and is_extension_array_dtype(result.dtype):
+        assert result.dtype.kind == "b"
+        mask = isna(result)
     else:
         if is_sparse(result):
             result = result.to_dense()
@@ -2330,7 +2336,7 @@ def check_bool_indexer(index: Index, key) -> np.ndarray:
                 "Item wrong length {} instead of {}.".format(len(result), len(index))
             )
 
-    return result
+    return result, mask
 
 
 def convert_missing_indexer(indexer):
diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py
index 8a543832b50fe..10fb1dbd9aea8 100644
--- a/pandas/core/internals/blocks.py
+++ b/pandas/core/internals/blocks.py
@@ -291,7 +291,7 @@ def __setstate__(self, state):
         self.values = state[1]
         self.ndim = self.values.ndim
 
-    def _slice(self, slicer):
+    def _slice(self, slicer, mask=None):
         """ return a slice of my values """
         return self.values[slicer]
 
@@ -1810,7 +1810,7 @@ def _can_hold_element(self, element: Any) -> bool:
         # We're doing the same as CategoricalBlock here.
         return True
 
-    def _slice(self, slicer):
+    def _slice(self, slicer, mask=None):
         """ return a slice of my values """
 
         # slice the category
@@ -2307,7 +2307,7 @@ def to_dense(self):
         # expects that behavior.
         return np.asarray(self.values, dtype=_NS_DTYPE)
 
-    def _slice(self, slicer):
+    def _slice(self, slicer, mask=None):
         """ return a slice of my values """
         if isinstance(slicer, tuple):
             col, loc = slicer
diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py
index f312b88d9a0bc..44776a6e07aba 100644
--- a/pandas/core/internals/managers.py
+++ b/pandas/core/internals/managers.py
@@ -709,7 +709,7 @@ def combine(self, blocks, copy=True):
 
         return type(self)(new_blocks, axes, do_integrity_check=False)
 
-    def get_slice(self, slobj, axis=0):
+    def get_slice(self, slobj, axis=0, mask=None):
         if axis >= self.ndim:
             raise IndexError("Requested axis not found in manager")
 
@@ -1505,11 +1505,13 @@ def _blklocs(self):
         """ compat with BlockManager """
         return None
 
-    def get_slice(self, slobj, axis=0):
+    def get_slice(self, slobj, axis=0, mask=None):
         if axis >= self.ndim:
             raise IndexError("Requested axis not found in manager")
 
-        return type(self)(self._block._slice(slobj), self.index[slobj], fastpath=True)
+        return type(self)(
+            self._block._slice(slobj, mask=mask), self.index[slobj], fastpath=True
+        )
 
     @property
     def index(self):
diff --git a/pandas/core/series.py b/pandas/core/series.py
index 3eabb70581827..1c3af26f6047a 100644
--- a/pandas/core/series.py
+++ b/pandas/core/series.py
@@ -892,11 +892,13 @@ def __getitem__(self, key):
             key = list(key)
 
         if com.is_bool_indexer(key):
-            key = check_bool_indexer(self.index, key)
+            key, mask = check_bool_indexer(self.index, key)
+        else:
+            mask = None
 
-        return self._get_with(key)
+        return self._get_with(key, mask)
 
-    def _get_with(self, key):
+    def _get_with(self, key, mask=None):
         # other: fancy integer or otherwise
         if isinstance(key, slice):
             return self._slice(key)
@@ -917,12 +919,13 @@ def _get_with(self, key):
                         return self._get_values(key)
                 raise
 
-        if not isinstance(key, (list, np.ndarray, Series, Index)):
+        if not is_list_like(key):
             key = list(key)
 
         if isinstance(key, Index):
             key_type = key.inferred_type
         else:
+            # TODO: why not use key.dtype?
             key_type = lib.infer_dtype(key, skipna=False)
 
         if key_type == "integer":
@@ -931,7 +934,7 @@ def _get_with(self, key):
             else:
                 return self._get_values(key)
         elif key_type == "boolean":
-            return self._get_values(key)
+            return self._get_values(key, mask)
 
         if isinstance(key, (list, tuple)):
             # TODO: de-dup with tuple case handled above?
@@ -960,10 +963,10 @@ def _get_values_tuple(self, key):
             self
         )
 
-    def _get_values(self, indexer):
+    def _get_values(self, indexer, mask=None):
         try:
             return self._constructor(
-                self._data.get_slice(indexer), fastpath=True
+                self._data.get_slice(indexer, mask=mask), fastpath=True
             ).__finalize__(self)
         except ValueError:
             # mpl compat if we look up e.g. ser[:, np.newaxis];
diff --git a/pandas/tests/frame/indexing/test_indexing.py b/pandas/tests/frame/indexing/test_indexing.py
index cd384d6fdbfad..fcebfe82b0b82 100644
--- a/pandas/tests/frame/indexing/test_indexing.py
+++ b/pandas/tests/frame/indexing/test_indexing.py
@@ -2013,6 +2013,7 @@ def test_index_single_double_tuples(self, tpl):
         tm.assert_frame_equal(result, expected)
 
     def test_boolean_indexing(self):
+        # TODO: parametrize
         idx = list(range(3))
         cols = ["A", "B", "C"]
         df1 = DataFrame(
@@ -2036,6 +2037,7 @@ def test_boolean_indexing(self):
             df1[df1.index[:-1] > 2] = -1
 
     def test_boolean_indexing_mixed(self):
+        # TODO: parametrize?
         df = DataFrame(
             {
                 0: {35: np.nan, 40: np.nan, 43: np.nan, 49: np.nan, 50: np.nan},
diff --git a/pandas/tests/indexing/test_na_indexing.py b/pandas/tests/indexing/test_na_indexing.py
new file mode 100644
index 0000000000000..9e1228cdfe6ac
--- /dev/null
+++ b/pandas/tests/indexing/test_na_indexing.py
@@ -0,0 +1,58 @@
+import pytest
+
+import pandas as pd
+import pandas.util.testing as tm
+
+
+@pytest.mark.parametrize(
+    "values, expected, dtype",
+    [
+        ([1, 2, 3], [1, None], "Int64"),
+        (["a", "b", "c"], ["a", pd.NA], "string"),
+        ([None, "b", "c"], [pd.NA, pd.NA], "string"),
+        ([True, False, None], [True, pd.NA], "boolean"),
+        ([None, False, None], [pd.NA, pd.NA], "boolean"),
+        pytest.param(
+            [pd.Timestamp("2000"), pd.Timestamp("2001"), pd.Timestamp("2002")],
+            [pd.Timestamp("2000"), pd.NaT],
+            "datetime64[ns]",
+            marks=pytest.mark.xfail(reson="TODO. Change DatetimeBlock._slice"),
+        ),
+        (
+            [
+                pd.Timestamp("2000", tz="CET"),
+                pd.Timestamp("2001", tz="CET"),
+                pd.Timestamp("2002", tz="CET"),
+            ],
+            [pd.Timestamp("2000", tz="CET"), pd.NaT],
+            "datetime64[ns, cet]",
+        ),
+        pytest.param(
+            [pd.Timedelta("1H"), pd.Timedelta("2H"), pd.Timedelta("3H")],
+            [pd.Timedelta("1H"), pd.NaT],
+            "timedelta64[ns]",
+            marks=pytest.mark.xfail(reson="TODO. Change TimeDeltaBlock._slice"),
+        ),
+        (
+            [pd.Period("2000"), pd.Period("2001"), pd.Period("2002")],
+            [pd.Period("2000"), pd.NaT],
+            "period[A-DEC]",
+        ),
+        (["a", "b", "c"], ["a", None], pd.CategoricalDtype(["a", "b", "c"])),
+    ],
+)
+def test_mask_series(values, expected, dtype):
+    s = pd.Series(values, dtype=dtype, name="name")
+    orig = pd.Series(values, dtype=dtype, name="name")
+    mask = pd.array([True, False, None], dtype="boolean")
+    result = s[mask]
+    # expected = pd.Series([1, pd.NA], index=[0, 2], dtype="Int64")
+    expected = pd.Series(expected, dtype=dtype, name="name", index=[0, 2])
+    tm.assert_series_equal(result, expected)
+    assert result.dtype == s.dtype
+    assert pd.isna(result.iloc[-1])
+
+    tm.assert_equal(s.array[mask], expected.array)
+
+    # ensure no mutation
+    tm.assert_series_equal(s, orig)