is_bool_dtype for ExtensionArrays (pandas-dev#22667)

TomAugspurger · victor · commit 4f44f84f332d · 2018-10-01T01:56:16.000+02:00
Closes pandas-dev#22665 Closes pandas-dev#22326
diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt
@@ -485,6 +485,7 @@ ExtensionType Changes
 - ``ExtensionArray`` has gained the abstract methods ``.dropna()`` (:issue:`21185`)
 - ``ExtensionDtype`` has gained the ability to instantiate from string dtypes, e.g. ``decimal`` would instantiate a registered ``DecimalDtype``; furthermore
   the ``ExtensionDtype`` has gained the method ``construct_array_type`` (:issue:`21185`)
+- An ``ExtensionArray`` with a boolean dtype now works correctly as a boolean indexer. :meth:`pandas.api.types.is_bool_dtype` now properly considers them boolean (:issue:`22326`)
 - Added ``ExtensionDtype._is_numeric`` for controlling whether an extension dtype is considered numeric (:issue:`22290`).
 - The ``ExtensionArray`` constructor, ``_from_sequence`` now take the keyword arg ``copy=False`` (:issue:`21185`)
 - Bug in :meth:`Series.get` for ``Series`` using ``ExtensionArray`` and integer index (:issue:`21257`)
@@ -616,7 +617,8 @@ Categorical
 ^^^^^^^^^^^
 
 - Bug in :meth:`Categorical.from_codes` where ``NaN`` values in ``codes`` were silently converted to ``0`` (:issue:`21767`). In the future this will raise a ``ValueError``. Also changes the behavior of ``.from_codes([1.1, 2.0])``.
-- Constructing a :class:`pd.CategoricalIndex` with empty values and boolean categories was raising a ``ValueError`` after a change to dtype coercion (:issue:`22702`).
+- Bug when indexing with a boolean-valued ``Categorical``. Now a boolean-valued ``Categorical`` is treated as a boolean mask (:issue:`22665`)
+- Constructing a :class:`CategoricalIndex` with empty values and boolean categories was raising a ``ValueError`` after a change to dtype coercion (:issue:`22702`).
 
 Datetimelike
 ^^^^^^^^^^^^
diff --git a/pandas/core/common.py b/pandas/core/common.py
@@ -15,7 +15,9 @@
 from pandas import compat
 from pandas.compat import iteritems, PY36, OrderedDict
 from pandas.core.dtypes.generic import ABCSeries, ABCIndex, ABCIndexClass
-from pandas.core.dtypes.common import is_integer
+from pandas.core.dtypes.common import (
+    is_integer, is_bool_dtype, is_extension_array_dtype, is_array_like
+)
 from pandas.core.dtypes.inference import _iterable_not_string
 from pandas.core.dtypes.missing import isna, isnull, notnull  # noqa
 from pandas.core.dtypes.cast import construct_1d_object_array_from_listlike
@@ -100,17 +102,45 @@ def maybe_box_datetimelike(value):
 
 
 def is_bool_indexer(key):
-    if isinstance(key, (ABCSeries, np.ndarray, ABCIndex)):
+    # type: (Any) -> bool
+    """
+    Check whether `key` is a valid boolean indexer.
+
+    Parameters
+    ----------
+    key : Any
+        Only list-likes may be considered boolean indexers.
+        All other types are not considered a boolean indexer.
+        For array-like input, boolean ndarrays or ExtensionArrays
+        with ``_is_boolean`` set are considered boolean indexers.
+
+    Returns
+    -------
+    bool
+
+    Raises
+    ------
+    ValueError
+        When the array is an object-dtype ndarray or ExtensionArray
+        and contains missing values.
+    """
+    na_msg = 'cannot index with vector containing NA / NaN values'
+    if (isinstance(key, (ABCSeries, np.ndarray, ABCIndex)) or
+            (is_array_like(key) and is_extension_array_dtype(key.dtype))):
         if key.dtype == np.object_:
             key = np.asarray(values_from_object(key))
 
             if not lib.is_bool_array(key):
                 if isna(key).any():
-                    raise ValueError('cannot index with vector containing '
-                                     'NA / NaN values')
+                    raise ValueError(na_msg)
                 return False
             return True
-        elif key.dtype == np.bool_:
+        elif is_bool_dtype(key.dtype):
+            # an ndarray with bool-dtype by definition has no missing values.
+            # So we only need to check for NAs in ExtensionArrays
+            if is_extension_array_dtype(key.dtype):
+                if np.any(key.isna()):
+                    raise ValueError(na_msg)
             return True
     elif isinstance(key, list):
         try:
diff --git a/pandas/core/dtypes/base.py b/pandas/core/dtypes/base.py
@@ -106,6 +106,25 @@ def _is_numeric(self):
         """
         return False
 
+    @property
+    def _is_boolean(self):
+        # type: () -> bool
+        """
+        Whether this dtype should be considered boolean.
+
+        By default, ExtensionDtypes are assumed to be non-numeric.
+        Setting this to True will affect the behavior of several places,
+        e.g.
+
+        * is_bool
+        * boolean indexing
+
+        Returns
+        -------
+        bool
+        """
+        return False
+
 
 class ExtensionDtype(_DtypeOpsMixin):
     """A custom data type, to be paired with an ExtensionArray.
@@ -125,6 +144,7 @@ class ExtensionDtype(_DtypeOpsMixin):
     pandas operations
 
     * _is_numeric
+    * _is_boolean
 
     Optionally one can override construct_array_type for construction
     with the name of this dtype via the Registry. See
diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py
@@ -1619,6 +1619,11 @@ def is_bool_dtype(arr_or_dtype):
     -------
     boolean : Whether or not the array or dtype is of a boolean dtype.
 
+    Notes
+    -----
+    An ExtensionArray is considered boolean when the ``_is_boolean``
+    attribute is set to True.
+
     Examples
     --------
     >>> is_bool_dtype(str)
@@ -1635,6 +1640,8 @@ def is_bool_dtype(arr_or_dtype):
     False
     >>> is_bool_dtype(np.array([True, False]))
     True
+    >>> is_bool_dtype(pd.Categorical([True, False]))
+    True
     """
 
     if arr_or_dtype is None:
@@ -1645,6 +1652,13 @@ def is_bool_dtype(arr_or_dtype):
         # this isn't even a dtype
         return False
 
+    if isinstance(arr_or_dtype, (ABCCategorical, ABCCategoricalIndex)):
+        arr_or_dtype = arr_or_dtype.dtype
+
+    if isinstance(arr_or_dtype, CategoricalDtype):
+        arr_or_dtype = arr_or_dtype.categories
+        # now we use the special definition for Index
+
     if isinstance(arr_or_dtype, ABCIndexClass):
 
         # TODO(jreback)
@@ -1653,6 +1667,9 @@ def is_bool_dtype(arr_or_dtype):
         # guess this
         return (arr_or_dtype.is_object and
                 arr_or_dtype.inferred_type == 'boolean')
+    elif is_extension_array_dtype(arr_or_dtype):
+        dtype = getattr(arr_or_dtype, 'dtype', arr_or_dtype)
+        return dtype._is_boolean
 
     return issubclass(tipo, np.bool_)
 
diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py
@@ -462,6 +462,12 @@ def ordered(self):
         """Whether the categories have an ordered relationship"""
         return self._ordered
 
+    @property
+    def _is_boolean(self):
+        from pandas.core.dtypes.common import is_bool_dtype
+
+        return is_bool_dtype(self.categories)
+
 
 class DatetimeTZDtypeType(type):
     """
diff --git a/pandas/tests/arrays/categorical/test_indexing.py b/pandas/tests/arrays/categorical/test_indexing.py
@@ -5,7 +5,8 @@
 import numpy as np
 
 import pandas.util.testing as tm
-from pandas import Categorical, Index, CategoricalIndex, PeriodIndex
+from pandas import Categorical, Index, CategoricalIndex, PeriodIndex, Series
+import pandas.core.common as com
 from pandas.tests.arrays.categorical.common import TestCategorical
 
 
@@ -121,3 +122,27 @@ def test_get_indexer_non_unique(self, idx_values, key_values, key_class):
 
             tm.assert_numpy_array_equal(expected, result)
             tm.assert_numpy_array_equal(exp_miss, res_miss)
+
+
+@pytest.mark.parametrize("index", [True, False])
+def test_mask_with_boolean(index):
+    s = Series(range(3))
+    idx = Categorical([True, False, True])
+    if index:
+        idx = CategoricalIndex(idx)
+
+    assert com.is_bool_indexer(idx)
+    result = s[idx]
+    expected = s[idx.astype('object')]
+    tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize("index", [True, False])
+def test_mask_with_boolean_raises(index):
+    s = Series(range(3))
+    idx = Categorical([True, False, None])
+    if index:
+        idx = CategoricalIndex(idx)
+
+    with tm.assert_raises_regex(ValueError, 'NA / NaN'):
+        s[idx]
diff --git a/pandas/tests/dtypes/test_dtypes.py b/pandas/tests/dtypes/test_dtypes.py
@@ -17,7 +17,7 @@
     is_dtype_equal, is_datetime64_ns_dtype,
     is_datetime64_dtype, is_interval_dtype,
     is_datetime64_any_dtype, is_string_dtype,
-    _coerce_to_dtype)
+    _coerce_to_dtype, is_bool_dtype)
 import pandas.util.testing as tm
 
 
@@ -126,6 +126,18 @@ def test_tuple_categories(self):
         result = CategoricalDtype(categories)
         assert all(result.categories == categories)
 
+    @pytest.mark.parametrize("categories, expected", [
+        ([True, False], True),
+        ([True, False, None], True),
+        ([True, False, "a", "b'"], False),
+        ([0, 1], False),
+    ])
+    def test_is_boolean(self, categories, expected):
+        cat = Categorical(categories)
+        assert cat.dtype._is_boolean is expected
+        assert is_bool_dtype(cat) is expected
+        assert is_bool_dtype(cat.dtype) is expected
+
 
 class TestDatetimeTZDtype(Base):
 
diff --git a/pandas/tests/extension/arrow/__init__.py b/pandas/tests/extension/arrow/__init__.py
diff --git a/pandas/tests/extension/arrow/bool.py b/pandas/tests/extension/arrow/bool.py
@@ -0,0 +1,108 @@
+"""Rudimentary Apache Arrow-backed ExtensionArray.
+
+At the moment, just a boolean array / type is implemented.
+Eventually, we'll want to parametrize the type and support
+multiple dtypes. Not all methods are implemented yet, and the
+current implementation is not efficient.
+"""
+import copy
+import itertools
+
+import numpy as np
+import pyarrow as pa
+import pandas as pd
+from pandas.api.extensions import (
+    ExtensionDtype, ExtensionArray, take, register_extension_dtype
+)
+
+
+@register_extension_dtype
+class ArrowBoolDtype(ExtensionDtype):
+
+    type = np.bool_
+    kind = 'b'
+    name = 'arrow_bool'
+    na_value = pa.NULL
+
+    @classmethod
+    def construct_from_string(cls, string):
+        if string == cls.name:
+            return cls()
+        else:
+            raise TypeError("Cannot construct a '{}' from "
+                            "'{}'".format(cls, string))
+
+    @classmethod
+    def construct_array_type(cls):
+        return ArrowBoolArray
+
+    def _is_boolean(self):
+        return True
+
+
+class ArrowBoolArray(ExtensionArray):
+    def __init__(self, values):
+        if not isinstance(values, pa.ChunkedArray):
+            raise ValueError
+
+        assert values.type == pa.bool_()
+        self._data = values
+        self._dtype = ArrowBoolDtype()
+
+    def __repr__(self):
+        return "ArrowBoolArray({})".format(repr(self._data))
+
+    @classmethod
+    def from_scalars(cls, values):
+        arr = pa.chunked_array([pa.array(np.asarray(values))])
+        return cls(arr)
+
+    @classmethod
+    def from_array(cls, arr):
+        assert isinstance(arr, pa.Array)
+        return cls(pa.chunked_array([arr]))
+
+    @classmethod
+    def _from_sequence(cls, scalars, dtype=None, copy=False):
+        return cls.from_scalars(scalars)
+
+    def __getitem__(self, item):
+        return self._data.to_pandas()[item]
+
+    def __len__(self):
+        return len(self._data)
+
+    @property
+    def dtype(self):
+        return self._dtype
+
+    @property
+    def nbytes(self):
+        return sum(x.size for chunk in self._data.chunks
+                   for x in chunk.buffers()
+                   if x is not None)
+
+    def isna(self):
+        return pd.isna(self._data.to_pandas())
+
+    def take(self, indices, allow_fill=False, fill_value=None):
+        data = self._data.to_pandas()
+
+        if allow_fill and fill_value is None:
+            fill_value = self.dtype.na_value
+
+        result = take(data, indices, fill_value=fill_value,
+                      allow_fill=allow_fill)
+        return self._from_sequence(result, dtype=self.dtype)
+
+    def copy(self, deep=False):
+        if deep:
+            return copy.deepcopy(self._data)
+        else:
+            return copy.copy(self._data)
+
+    def _concat_same_type(cls, to_concat):
+        chunks = list(itertools.chain.from_iterable(x._data.chunks
+                                                    for x in to_concat))
+        arr = pa.chunked_array(chunks)
+        return cls(arr)
diff --git a/pandas/tests/extension/arrow/test_bool.py b/pandas/tests/extension/arrow/test_bool.py
@@ -0,0 +1,48 @@
+import numpy as np
+import pytest
+import pandas as pd
+import pandas.util.testing as tm
+from pandas.tests.extension import base
+
+pytest.importorskip('pyarrow', minversion="0.10.0")
+
+from .bool import ArrowBoolDtype, ArrowBoolArray
+
+
+@pytest.fixture
+def dtype():
+    return ArrowBoolDtype()
+
+
+@pytest.fixture
+def data():
+    return ArrowBoolArray.from_scalars(np.random.randint(0, 2, size=100,
+                                       dtype=bool))
+
+
+class BaseArrowTests(object):
+    pass
+
+
+class TestDtype(BaseArrowTests, base.BaseDtypeTests):
+    def test_array_type_with_arg(self, data, dtype):
+        pytest.skip("GH-22666")
+
+
+class TestInterface(BaseArrowTests, base.BaseInterfaceTests):
+    def test_repr(self, data):
+        raise pytest.skip("TODO")
+
+
+class TestConstructors(BaseArrowTests, base.BaseConstructorsTests):
+    def test_from_dtype(self, data):
+        pytest.skip("GH-22666")
+
+
+def test_is_bool_dtype(data):
+    assert pd.api.types.is_bool_dtype(data)
+    assert pd.core.common.is_bool_indexer(data)
+    s = pd.Series(range(len(data)))
+    result = s[data]
+    expected = s[np.asarray(data)]
+    tm.assert_series_equal(result, expected)