Skip to content

Commit 88c6126

Browse files
committed
Merge remote-tracking branch 'upstream/master' into ea-xs
2 parents f008c38 + 0480f4c commit 88c6126

File tree

17 files changed

+307
-21
lines changed

17 files changed

+307
-21
lines changed

doc/source/whatsnew/v0.24.0.txt

+3
Original file line numberDiff line numberDiff line change
@@ -485,6 +485,7 @@ ExtensionType Changes
485485
- ``ExtensionArray`` has gained the abstract method ``.dropna()`` (:issue:`21185`)
486486
- ``ExtensionDtype`` has gained the ability to instantiate from string dtypes, e.g. ``decimal`` would instantiate a registered ``DecimalDtype``; furthermore
487487
the ``ExtensionDtype`` has gained the method ``construct_array_type`` (:issue:`21185`)
488+
- An ``ExtensionArray`` with a boolean dtype now works correctly as a boolean indexer. :meth:`pandas.api.types.is_bool_dtype` now properly considers them boolean (:issue:`22326`)
488489
- Added ``ExtensionDtype._is_numeric`` for controlling whether an extension dtype is considered numeric (:issue:`22290`).
489490
- The ``ExtensionArray`` constructor, ``_from_sequence``, now takes the keyword arg ``copy=False`` (:issue:`21185`)
490491
- Bug in :meth:`Series.get` for ``Series`` using ``ExtensionArray`` and integer index (:issue:`21257`)
@@ -617,6 +618,8 @@ Categorical
617618
^^^^^^^^^^^
618619

619620
- Bug in :meth:`Categorical.from_codes` where ``NaN`` values in ``codes`` were silently converted to ``0`` (:issue:`21767`). In the future this will raise a ``ValueError``. Also changes the behavior of ``.from_codes([1.1, 2.0])``.
621+
- Bug when indexing with a boolean-valued ``Categorical``. Now a boolean-valued ``Categorical`` is treated as a boolean mask (:issue:`22665`)
622+
- Constructing a :class:`CategoricalIndex` with empty values and boolean categories was raising a ``ValueError`` after a change to dtype coercion (:issue:`22702`).
620623

621624
Datetimelike
622625
^^^^^^^^^^^^

pandas/core/arrays/categorical.py

+6-2
Original file line numberDiff line numberDiff line change
@@ -2439,9 +2439,13 @@ def _get_codes_for_values(values, categories):
24392439
"""
24402440
utility routine to turn values into codes given the specified categories
24412441
"""
2442-
24432442
from pandas.core.algorithms import _get_data_algo, _hashtables
2444-
if not is_dtype_equal(values.dtype, categories.dtype):
2443+
if is_dtype_equal(values.dtype, categories.dtype):
2444+
# To prevent erroneous dtype coercion in _get_data_algo, retrieve
2445+
# the underlying numpy array. gh-22702
2446+
values = getattr(values, 'values', values)
2447+
categories = getattr(categories, 'values', categories)
2448+
else:
24452449
values = ensure_object(values)
24462450
categories = ensure_object(categories)
24472451

pandas/core/common.py

+35-5
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,9 @@
1515
from pandas import compat
1616
from pandas.compat import iteritems, PY36, OrderedDict
1717
from pandas.core.dtypes.generic import ABCSeries, ABCIndex, ABCIndexClass
18-
from pandas.core.dtypes.common import is_integer
18+
from pandas.core.dtypes.common import (
19+
is_integer, is_bool_dtype, is_extension_array_dtype, is_array_like
20+
)
1921
from pandas.core.dtypes.inference import _iterable_not_string
2022
from pandas.core.dtypes.missing import isna, isnull, notnull # noqa
2123
from pandas.core.dtypes.cast import construct_1d_object_array_from_listlike
@@ -100,17 +102,45 @@ def maybe_box_datetimelike(value):
100102

101103

102104
def is_bool_indexer(key):
103-
if isinstance(key, (ABCSeries, np.ndarray, ABCIndex)):
105+
# type: (Any) -> bool
106+
"""
107+
Check whether `key` is a valid boolean indexer.
108+
109+
Parameters
110+
----------
111+
key : Any
112+
Only list-likes may be considered boolean indexers.
113+
All other types are not considered a boolean indexer.
114+
For array-like input, boolean ndarrays or ExtensionArrays
115+
with ``_is_boolean`` set are considered boolean indexers.
116+
117+
Returns
118+
-------
119+
bool
120+
121+
Raises
122+
------
123+
ValueError
124+
When the array is an object-dtype ndarray or ExtensionArray
125+
and contains missing values.
126+
"""
127+
na_msg = 'cannot index with vector containing NA / NaN values'
128+
if (isinstance(key, (ABCSeries, np.ndarray, ABCIndex)) or
129+
(is_array_like(key) and is_extension_array_dtype(key.dtype))):
104130
if key.dtype == np.object_:
105131
key = np.asarray(values_from_object(key))
106132

107133
if not lib.is_bool_array(key):
108134
if isna(key).any():
109-
raise ValueError('cannot index with vector containing '
110-
'NA / NaN values')
135+
raise ValueError(na_msg)
111136
return False
112137
return True
113-
elif key.dtype == np.bool_:
138+
elif is_bool_dtype(key.dtype):
139+
# an ndarray with bool-dtype by definition has no missing values.
140+
# So we only need to check for NAs in ExtensionArrays
141+
if is_extension_array_dtype(key.dtype):
142+
if np.any(key.isna()):
143+
raise ValueError(na_msg)
114144
return True
115145
elif isinstance(key, list):
116146
try:

pandas/core/dtypes/base.py

+20
Original file line numberDiff line numberDiff line change
@@ -106,6 +106,25 @@ def _is_numeric(self):
106106
"""
107107
return False
108108

109+
@property
110+
def _is_boolean(self):
111+
# type: () -> bool
112+
"""
113+
Whether this dtype should be considered boolean.
114+
115+
By default, ExtensionDtypes are assumed to be non-boolean.
116+
Setting this to True will affect the behavior of several places,
117+
e.g.
118+
119+
* is_bool_dtype
120+
* boolean indexing
121+
122+
Returns
123+
-------
124+
bool
125+
"""
126+
return False
127+
109128

110129
class ExtensionDtype(_DtypeOpsMixin):
111130
"""A custom data type, to be paired with an ExtensionArray.
@@ -125,6 +144,7 @@ class ExtensionDtype(_DtypeOpsMixin):
125144
pandas operations
126145
127146
* _is_numeric
147+
* _is_boolean
128148
129149
Optionally one can override construct_array_type for construction
130150
with the name of this dtype via the Registry. See

pandas/core/dtypes/common.py

+17
Original file line numberDiff line numberDiff line change
@@ -1619,6 +1619,11 @@ def is_bool_dtype(arr_or_dtype):
16191619
-------
16201620
boolean : Whether or not the array or dtype is of a boolean dtype.
16211621
1622+
Notes
1623+
-----
1624+
An ExtensionArray is considered boolean when the ``_is_boolean``
1625+
attribute is set to True.
1626+
16221627
Examples
16231628
--------
16241629
>>> is_bool_dtype(str)
@@ -1635,6 +1640,8 @@ def is_bool_dtype(arr_or_dtype):
16351640
False
16361641
>>> is_bool_dtype(np.array([True, False]))
16371642
True
1643+
>>> is_bool_dtype(pd.Categorical([True, False]))
1644+
True
16381645
"""
16391646

16401647
if arr_or_dtype is None:
@@ -1645,6 +1652,13 @@ def is_bool_dtype(arr_or_dtype):
16451652
# this isn't even a dtype
16461653
return False
16471654

1655+
if isinstance(arr_or_dtype, (ABCCategorical, ABCCategoricalIndex)):
1656+
arr_or_dtype = arr_or_dtype.dtype
1657+
1658+
if isinstance(arr_or_dtype, CategoricalDtype):
1659+
arr_or_dtype = arr_or_dtype.categories
1660+
# now we use the special definition for Index
1661+
16481662
if isinstance(arr_or_dtype, ABCIndexClass):
16491663

16501664
# TODO(jreback)
@@ -1653,6 +1667,9 @@ def is_bool_dtype(arr_or_dtype):
16531667
# guess this
16541668
return (arr_or_dtype.is_object and
16551669
arr_or_dtype.inferred_type == 'boolean')
1670+
elif is_extension_array_dtype(arr_or_dtype):
1671+
dtype = getattr(arr_or_dtype, 'dtype', arr_or_dtype)
1672+
return dtype._is_boolean
16561673

16571674
return issubclass(tipo, np.bool_)
16581675

pandas/core/dtypes/dtypes.py

+6
Original file line numberDiff line numberDiff line change
@@ -462,6 +462,12 @@ def ordered(self):
462462
"""Whether the categories have an ordered relationship"""
463463
return self._ordered
464464

465+
@property
466+
def _is_boolean(self):
467+
from pandas.core.dtypes.common import is_bool_dtype
468+
469+
return is_bool_dtype(self.categories)
470+
465471

466472
class DatetimeTZDtypeType(type):
467473
"""

pandas/core/frame.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -629,8 +629,8 @@ def _is_homogeneous(self):
629629
>>> DataFrame({"A": [1, 2], "B": [3.0, 4.0]})._is_homogeneous
630630
False
631631
632-
Items with the type but different sizes are considered different
633-
types.
632+
Items with the same type but different sizes are considered
633+
different types.
634634
635635
>>> DataFrame({"A": np.array([1, 2], dtype=np.int32),
636636
... "B": np.array([1, 2], dtype=np.int64)})._is_homogeneous

pandas/core/generic.py

+3-3
Original file line numberDiff line numberDiff line change
@@ -9671,15 +9671,15 @@ def _doc_parms(cls):
96719671
original index.
96729672
* None : reduce all axes, return a scalar.
96739673
9674+
bool_only : boolean, default None
9675+
Include only boolean columns. If None, will attempt to use everything,
9676+
then use only boolean data. Not implemented for Series.
96749677
skipna : boolean, default True
96759678
Exclude NA/null values. If an entire row/column is NA, the result
96769679
will be NA.
96779680
level : int or level name, default None
96789681
If the axis is a MultiIndex (hierarchical), count along a
96799682
particular level, collapsing into a %(name1)s.
9680-
bool_only : boolean, default None
9681-
Include only boolean columns. If None, will attempt to use everything,
9682-
then use only boolean data. Not implemented for Series.
96839683
**kwargs : any, default None
96849684
Additional keywords have no effect but might be accepted for
96859685
compatibility with NumPy.

pandas/core/indexes/multi.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -306,7 +306,7 @@ def _is_homogeneous(self):
306306
>>> MultiIndex.from_tuples([('a', 1), ('a', 2)])._is_homogeneous
307307
False
308308
"""
309-
return len(set(x.dtype for x in self.levels)) <= 1
309+
return len({x.dtype for x in self.levels}) <= 1
310310

311311
def _set_levels(self, levels, level=None, copy=False, validate=True,
312312
verify_integrity=False):

pandas/tests/arrays/categorical/test_constructors.py

+6
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,12 @@ def test_constructor_empty(self):
4242
expected = pd.Int64Index([1, 2, 3])
4343
tm.assert_index_equal(c.categories, expected)
4444

45+
def test_constructor_empty_boolean(self):
46+
# see gh-22702
47+
cat = pd.Categorical([], categories=[True, False])
48+
categories = sorted(cat.categories.tolist())
49+
assert categories == [False, True]
50+
4551
def test_constructor_tuples(self):
4652
values = np.array([(1,), (1, 2), (1,), (1, 2)], dtype=object)
4753
result = Categorical(values)

pandas/tests/arrays/categorical/test_indexing.py

+26-1
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,8 @@
55
import numpy as np
66

77
import pandas.util.testing as tm
8-
from pandas import Categorical, Index, CategoricalIndex, PeriodIndex
8+
from pandas import Categorical, Index, CategoricalIndex, PeriodIndex, Series
9+
import pandas.core.common as com
910
from pandas.tests.arrays.categorical.common import TestCategorical
1011

1112

@@ -121,3 +122,27 @@ def test_get_indexer_non_unique(self, idx_values, key_values, key_class):
121122

122123
tm.assert_numpy_array_equal(expected, result)
123124
tm.assert_numpy_array_equal(exp_miss, res_miss)
125+
126+
127+
@pytest.mark.parametrize("index", [True, False])
128+
def test_mask_with_boolean(index):
129+
s = Series(range(3))
130+
idx = Categorical([True, False, True])
131+
if index:
132+
idx = CategoricalIndex(idx)
133+
134+
assert com.is_bool_indexer(idx)
135+
result = s[idx]
136+
expected = s[idx.astype('object')]
137+
tm.assert_series_equal(result, expected)
138+
139+
140+
@pytest.mark.parametrize("index", [True, False])
141+
def test_mask_with_boolean_raises(index):
142+
s = Series(range(3))
143+
idx = Categorical([True, False, None])
144+
if index:
145+
idx = CategoricalIndex(idx)
146+
147+
with tm.assert_raises_regex(ValueError, 'NA / NaN'):
148+
s[idx]

pandas/tests/dtypes/test_dtypes.py

+13-1
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@
1717
is_dtype_equal, is_datetime64_ns_dtype,
1818
is_datetime64_dtype, is_interval_dtype,
1919
is_datetime64_any_dtype, is_string_dtype,
20-
_coerce_to_dtype)
20+
_coerce_to_dtype, is_bool_dtype)
2121
import pandas.util.testing as tm
2222

2323

@@ -126,6 +126,18 @@ def test_tuple_categories(self):
126126
result = CategoricalDtype(categories)
127127
assert all(result.categories == categories)
128128

129+
@pytest.mark.parametrize("categories, expected", [
130+
([True, False], True),
131+
([True, False, None], True),
132+
([True, False, "a", "b'"], False),
133+
([0, 1], False),
134+
])
135+
def test_is_boolean(self, categories, expected):
136+
cat = Categorical(categories)
137+
assert cat.dtype._is_boolean is expected
138+
assert is_bool_dtype(cat) is expected
139+
assert is_bool_dtype(cat.dtype) is expected
140+
129141

130142
class TestDatetimeTZDtype(Base):
131143

pandas/tests/extension/arrow/__init__.py

Whitespace-only changes.

0 commit comments

Comments
 (0)