pandas-dev · jreback · Jan 3, 2020 · Dec 16, 2019 · Dec 18, 2019 · Dec 18, 2019
diff --git a/asv_bench/benchmarks/indexing.py b/asv_bench/benchmarks/indexing.py
@@ -131,6 +131,7 @@ def setup(self):
         self.col_scalar = columns[10]
         self.bool_indexer = self.df[self.col_scalar] > 0
         self.bool_obj_indexer = self.bool_indexer.astype(object)
+        self.boolean_indexer = (self.df[self.col_scalar] > 0).astype("boolean")
 
     def time_loc(self):
         self.df.loc[self.idx_scalar, self.col_scalar]
@@ -144,6 +145,9 @@ def time_boolean_rows(self):
     def time_boolean_rows_object(self):
         self.df[self.bool_obj_indexer]
 
+    def time_boolean_rows_boolean(self):
+        self.df[self.boolean_indexer]
+
 
 class DataFrameNumericIndexing:
     def setup(self):

diff --git a/doc/source/reference/extensions.rst b/doc/source/reference/extensions.rst
@@ -18,6 +18,8 @@ objects.
    api.extensions.register_series_accessor
    api.extensions.register_index_accessor
    api.extensions.ExtensionDtype
+   api.extensions.is_bool_indexer
+   api.extensions.check_bool_array_indexer
 
 .. autosummary::
    :toctree: api/
@@ -26,6 +28,7 @@ objects.
    api.extensions.ExtensionArray
    arrays.PandasArray
 
+
 .. We need this autosummary so that methods and attributes are generated.
 .. Separate block, since they aren't classes.
 

diff --git a/doc/source/user_guide/boolean.rst b/doc/source/user_guide/boolean.rst
@@ -14,6 +14,29 @@ Nullable Boolean Data Type
 
 .. versionadded:: 1.0.0
 
+
+.. _boolean.indexing:
+
+Indexing with NA values
+-----------------------
+
+pandas does not allow indexing with NA values. Attempting to do so
+will raise a ``ValueError``.
+
+.. ipython:: python
+   :okexcept:
+
+   s = pd.Series([1, 2, 3])
+   mask = pd.array([True, False, pd.NA], dtype="boolean")
+   s[mask]
+
+The missing values will need to be explicitly filled with ``True`` or ``False``
+prior to using the array as a mask.
+
+.. ipython:: python
+
+   s[mask.fillna(False)]
+
 .. _boolean.kleene:
 
 Kleene Logical Operations

diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst
@@ -708,6 +708,7 @@ Datetimelike
 - Bug in :func:`pandas._config.localization.get_locales` where the ``locales -a`` encodes the locales list as windows-1252 (:issue:`23638`, :issue:`24760`, :issue:`27368`)
 - Bug in :meth:`Series.var` failing to raise ``TypeError`` when called with ``timedelta64[ns]`` dtype (:issue:`28289`)
 - Bug in :meth:`DatetimeIndex.strftime` and :meth:`Series.dt.strftime` where ``NaT`` was converted to the string ``'NaT'`` instead of ``np.nan`` (:issue:`29578`)
+- Bug in masking datetime-like arrays with a boolean mask of an incorrect length not raising an ``IndexError`` (:issue:`30308`)
 - Bug in :attr:`Timestamp.resolution` being a property instead of a class attribute (:issue:`29910`)
 - Bug in :func:`pandas.to_datetime` when called with ``None`` raising ``TypeError`` instead of returning ``NaT`` (:issue:`30011`)
 - Bug in :func:`pandas.to_datetime` failing for `deques` when using ``cache=True`` (the default) (:issue:`29403`)

diff --git a/pandas/api/extensions/__init__.py b/pandas/api/extensions/__init__.py
@@ -11,3 +11,5 @@
 )
 from pandas.core.algorithms import take  # noqa: F401
 from pandas.core.arrays import ExtensionArray, ExtensionScalarOpsMixin  # noqa: F401
+from pandas.core.common import is_bool_indexer  # noqa: F401
+from pandas.core.indexing import check_bool_array_indexer  # noqa: F401
diff --git a/pandas/core/arrays/boolean.py b/pandas/core/arrays/boolean.py
@@ -29,6 +29,7 @@
 from pandas.core import nanops, ops
 from pandas.core.algorithms import take
 from pandas.core.arrays import ExtensionArray, ExtensionOpsMixin
+from pandas.core.common import is_bool_indexer
 
 if TYPE_CHECKING:
     from pandas._typing import Scalar
@@ -307,11 +308,25 @@ def _from_factorized(cls, values, original: "BooleanArray"):
     def _formatter(self, boxed=False):
         return str
 
+    @property
+    def _hasna(self) -> bool:
+        # Note: this is expensive right now! The hope is that we can
+        # make this faster by having an optional mask, without having to
+        # change the source code that uses it.
+        return self._mask.any()
+
     def __getitem__(self, item):
+        # Import here to avoid a circular import; probably needs restructuring.
+        from pandas.core.indexing import check_bool_array_indexer
+
         if is_integer(item):
             if self._mask[item]:
                 return self.dtype.na_value
             return self._data[item]
+
+        elif is_bool_indexer(item):
+            item = check_bool_array_indexer(self, item)
+
         return type(self)(self._data[item], self._mask[item])
 
     def _coerce_to_ndarray(self, dtype=None, na_value: "Scalar" = libmissing.NA):
@@ -329,7 +344,7 @@ def _coerce_to_ndarray(self, dtype=None, na_value: "Scalar" = libmissing.NA):
         if dtype is None:
             dtype = object
         if is_bool_dtype(dtype):
-            if not self.isna().any():
+            if not self._hasna:
                 return self._data
             else:
                 raise ValueError(
@@ -503,7 +518,7 @@ def astype(self, dtype, copy=True):
 
         if is_bool_dtype(dtype):
             # astype_nansafe converts np.nan to True
-            if self.isna().any():
+            if self._hasna:
                 raise ValueError("cannot convert float NaN to bool")
             else:
                 return self._data.astype(dtype, copy=copy)
@@ -515,7 +530,7 @@ def astype(self, dtype, copy=True):
             )
         # for integer, error if there are missing values
         if is_integer_dtype(dtype):
-            if self.isna().any():
+            if self._hasna:
                 raise ValueError("cannot convert NA to integer")
         # for float dtype, ensure we use np.nan before casting (numpy cannot
         # deal with pd.NA)

diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py
@@ -1990,16 +1990,21 @@ def __getitem__(self, key):
         """
         Return an item.
         """
+        from pandas.core.indexing import check_bool_array_indexer
+
         if isinstance(key, (int, np.integer)):
             i = self._codes[key]
             if i == -1:
                 return np.nan
             else:
                 return self.categories[i]
-        else:
-            return self._constructor(
-                values=self._codes[key], dtype=self.dtype, fastpath=True
-            )
+
+        elif com.is_bool_indexer(key):
+            key = check_bool_array_indexer(self, key)
+
+        return self._constructor(
+            values=self._codes[key], dtype=self.dtype, fastpath=True
+        )
 
     def __setitem__(self, key, value):
         """

diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py
@@ -416,7 +416,9 @@ def __getitem__(self, key):
             return self._box_func(val)
 
         if com.is_bool_indexer(key):
-            key = np.asarray(key, dtype=bool)
+            from pandas.core.indexing import check_bool_array_indexer
+
+            key = check_bool_array_indexer(self, key)
             if key.all():
                 key = slice(0, None, None)
             else:

diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py
@@ -26,6 +26,7 @@
 from pandas.core import nanops, ops
 from pandas.core.algorithms import take
 from pandas.core.arrays import ExtensionArray, ExtensionOpsMixin
+from pandas.core.common import is_bool_indexer
 from pandas.core.ops import invalid_comparison
 from pandas.core.ops.common import unpack_zerodim_and_defer
 from pandas.core.tools.numeric import to_numeric
@@ -367,10 +368,17 @@ def fmt(x):
         return fmt
 
     def __getitem__(self, item):
+        # Importing this at the top-level causes many unrelated(?) mypy failures
+        from pandas.core.indexing import check_bool_array_indexer
+
         if is_integer(item):
             if self._mask[item]:
                 return self.dtype.na_value
             return self._data[item]
+
+        elif is_bool_indexer(item):
+            item = check_bool_array_indexer(self, item)
+
         return type(self)(self._data[item], self._mask[item])
 
     def _coerce_to_ndarray(self):

diff --git a/pandas/core/arrays/numpy_.py b/pandas/core/arrays/numpy_.py
@@ -17,6 +17,7 @@
 from pandas import compat
 from pandas.core import nanops
 from pandas.core.algorithms import searchsorted, take, unique
+from pandas.core.common import is_bool_indexer
 from pandas.core.construction import extract_array
 from pandas.core.missing import backfill_1d, pad_1d
 
@@ -231,9 +232,15 @@ def __array_ufunc__(self, ufunc, method, *inputs, **kwargs):
     # Pandas ExtensionArray Interface
 
     def __getitem__(self, item):
+        # Avoid mypy failures when importing at the top-level
+        from pandas.core.indexing import check_bool_array_indexer
+
         if isinstance(item, type(self)):
             item = item._ndarray
 
+        elif is_bool_indexer(item):
+            item = check_bool_array_indexer(self, item)
+
         result = self._ndarray[item]
         if not lib.is_scalar(item):
             result = type(self)(result)

diff --git a/pandas/core/arrays/sparse/array.py b/pandas/core/arrays/sparse/array.py
@@ -738,6 +738,9 @@ def value_counts(self, dropna=True):
     # --------
 
     def __getitem__(self, key):
+        # avoid mypy issues when importing at the top-level
+        from pandas.core.indexing import check_bool_indexer
+
         if isinstance(key, tuple):
             if len(key) > 1:
                 raise IndexError("too many indices for array.")
@@ -766,7 +769,9 @@ def __getitem__(self, key):
                 else:
                     key = np.asarray(key)
 
-            if com.is_bool_indexer(key) and len(self) == len(key):
+            if com.is_bool_indexer(key):
+                key = check_bool_indexer(self, key)
+
                 return self.take(np.arange(len(key), dtype=np.int32)[key])
             elif hasattr(key, "__len__"):
                 return self.take(key)

diff --git a/pandas/core/common.py b/pandas/core/common.py
@@ -110,12 +110,18 @@ def is_bool_indexer(key: Any) -> bool:
     Returns
     -------
     bool
+        Whether `key` is a valid boolean indexer.
 
     Raises
     ------
     ValueError
         When the array is an object-dtype ndarray or ExtensionArray
         and contains missing values.
+
+    See Also
+    --------
+    api.extensions.check_bool_array_indexer : Check that `key`
+        is a valid mask for an array, and convert to an ndarray.
     """
     na_msg = "cannot index with vector containing NA / NaN values"
     if isinstance(key, (ABCSeries, np.ndarray, ABCIndex)) or (

diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py
@@ -21,6 +21,7 @@
 from pandas.core.dtypes.generic import ABCDataFrame, ABCMultiIndex, ABCSeries
 from pandas.core.dtypes.missing import _infer_fill_value, isna
 
+from pandas._typing import AnyArrayLike
 import pandas.core.common as com
 from pandas.core.indexers import is_list_like_indexer, length_of_indexer
 from pandas.core.indexes.api import Index, InvalidIndexError
@@ -2268,6 +2269,69 @@ def convert_to_index_sliceable(obj, key):
     return None
 
 
+def check_bool_array_indexer(array: AnyArrayLike, mask: AnyArrayLike) -> np.ndarray:
+    """
+    Check if `mask` is a valid boolean indexer for `array`.
+
+    `array` and `mask` are checked to have the same length, and the
+    dtype is validated.
+
+    Parameters
+    ----------
+    array : array
+        The array that's being masked.
+    mask : array
+        The boolean array that's masking.
+
+    Returns
+    -------
+    numpy.ndarray
+        The validated boolean mask.
+
+    Raises
+    ------
+    IndexError
+        When the lengths don't match.
+    ValueError
+        When `mask` cannot be converted to a bool-dtype ndarray.
+
+    See Also
+    --------
+    api.extensions.is_bool_indexer : Check if `key` is a boolean indexer.
+
+    Examples
+    --------
+    A boolean ndarray is returned when the arguments are all valid.
+
+    >>> mask = pd.array([True, False])
+    >>> arr = pd.Series([1, 2])
+    >>> pd.api.extensions.check_bool_array_indexer(arr, mask)
+    array([ True, False])
+
+    An IndexError is raised when the lengths don't match.
+
+    >>> mask = pd.array([True, False, True])
+    >>> pd.api.extensions.check_bool_array_indexer(arr, mask)
+    Traceback (most recent call last):
+    ...
+    IndexError: Item wrong length 3 instead of 2.
+
+    A ValueError is raised when the mask cannot be converted to
+    a bool-dtype ndarray.
+
+    >>> mask = pd.array([True, pd.NA])
+    >>> pd.api.extensions.check_bool_array_indexer(arr, mask)
+    Traceback (most recent call last):
+    ...
+    ValueError: cannot convert to bool numpy array in presence of missing values
+    """
+    result = np.asarray(mask, dtype=bool)
+    # GH26658
+    if len(result) != len(array):
+        raise IndexError(f"Item wrong length {len(result)} instead of {len(array)}.")
+    return result
+
+
 def check_bool_indexer(index: Index, key) -> np.ndarray:
     """
     Check if key is a valid boolean indexer for an object with such index and
@@ -2308,13 +2372,7 @@ def check_bool_indexer(index: Index, key) -> np.ndarray:
     else:
         if is_sparse(result):
             result = result.to_dense()
-        result = np.asarray(result, dtype=bool)
-
-        # GH26658
-        if len(result) != len(index):
-            raise IndexError(
-                f"Item wrong length {len(result)} instead of {len(index)}."
-            )
+        result = check_bool_array_indexer(index, result)
 
     return result