Skip to content

Commit b9bcdc3

Browse files
authored
ENH: Enable indexing with nullable Boolean (#31591)
1 parent 80d37ad commit b9bcdc3

File tree

14 files changed

+109
-66
lines changed

14 files changed

+109
-66
lines changed

Diff for: doc/source/user_guide/boolean.rst

+5-5
Original file line numberDiff line numberDiff line change
@@ -20,8 +20,9 @@ Nullable Boolean data type
2020
Indexing with NA values
2121
-----------------------
2222

23-
pandas does not allow indexing with NA values. Attempting to do so
24-
will raise a ``ValueError``.
23+
pandas allows indexing with ``NA`` values in a boolean array, which are treated as ``False``.
24+
25+
.. versionchanged:: 1.0.2
2526

2627
.. ipython:: python
2728
:okexcept:
@@ -30,12 +31,11 @@ will raise a ``ValueError``.
3031
mask = pd.array([True, False, pd.NA], dtype="boolean")
3132
s[mask]
3233
33-
The missing values will need to be explicitly filled with True or False prior
34-
to using the array as a mask.
34+
If you would prefer to keep the elements where the mask is ``NA``, you can manually fill the ``NA`` values with ``fillna(True)``.
3535

3636
.. ipython:: python
3737
38-
s[mask.fillna(False)]
38+
s[mask.fillna(True)]
3939
4040
.. _boolean.kleene:
4141

Diff for: doc/source/user_guide/indexing.rst

+10-2
Original file line numberDiff line numberDiff line change
@@ -59,7 +59,7 @@ of multi-axis indexing.
5959
slices, **both** the start and the stop are included, when present in the
6060
index! See :ref:`Slicing with labels <indexing.slicing_with_labels>`
6161
and :ref:`Endpoints are inclusive <advanced.endpoints_are_inclusive>`.)
62-
* A boolean array
62+
* A boolean array (any ``NA`` values will be treated as ``False``).
6363
* A ``callable`` function with one argument (the calling Series or DataFrame) and
6464
that returns valid output for indexing (one of the above).
6565

@@ -75,7 +75,7 @@ of multi-axis indexing.
7575
* An integer e.g. ``5``.
7676
* A list or array of integers ``[4, 3, 0]``.
7777
* A slice object with ints ``1:7``.
78-
* A boolean array.
78+
* A boolean array (any ``NA`` values will be treated as ``False``).
7979
* A ``callable`` function with one argument (the calling Series or DataFrame) and
8080
that returns valid output for indexing (one of the above).
8181

@@ -374,6 +374,14 @@ For getting values with a boolean array:
374374
df1.loc['a'] > 0
375375
df1.loc[:, df1.loc['a'] > 0]
376376
377+
NA values in a boolean array propagate as ``False``:
378+
379+
.. versionchanged:: 1.0.2
380+
381+
mask = pd.array([True, False, True, False, pd.NA, False], dtype="boolean")
382+
mask
383+
df1[mask]
384+
377385
For getting a value explicitly:
378386

379387
.. ipython:: python

Diff for: doc/source/whatsnew/v1.0.2.rst

+27-2
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,33 @@ Fixed regressions
2626

2727
.. ---------------------------------------------------------------------------
2828
29+
Indexing with Nullable Boolean Arrays
30+
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
31+
32+
Previously, indexing with a nullable Boolean array containing ``NA`` would raise a ``ValueError``; this is now permitted, with ``NA`` being treated as ``False``. (:issue:`31503`)
33+
34+
.. ipython:: python
35+
36+
s = pd.Series([1, 2, 3, 4])
37+
mask = pd.array([True, True, False, None], dtype="boolean")
38+
s
39+
mask
40+
41+
*pandas 1.0.0-1.0.1*
42+
43+
.. code-block:: python
44+
45+
>>> s[mask]
46+
Traceback (most recent call last):
47+
...
48+
ValueError: cannot mask with array containing NA / NaN values
49+
50+
*pandas 1.0.2*
51+
52+
.. ipython:: python
53+
54+
s[mask]
55+
2956
.. _whatsnew_102.bug_fixes:
3057

3158
Bug fixes
@@ -45,8 +72,6 @@ Bug fixes
4572
- Using ``pd.NA`` with :meth:`DataFrame.to_json` now correctly outputs a null value instead of an empty object (:issue:`31615`)
4673
- Fixed bug in parquet roundtrip with nullable unsigned integer dtypes (:issue:`31896`).
4774

48-
49-
5075
**Experimental dtypes**
5176

5277
- Fix bug in :meth:`DataFrame.convert_dtypes` for columns that were already using the ``"string"`` dtype (:issue:`31731`).

Diff for: pandas/core/arrays/datetimelike.py

+3-1
Original file line numberDiff line numberDiff line change
@@ -520,7 +520,9 @@ def __getitem__(self, key):
520520
if com.is_bool_indexer(key):
521521
# first convert to boolean, because check_array_indexer doesn't
522522
# allow object dtype
523-
key = np.asarray(key, dtype=bool)
523+
if is_object_dtype(key):
524+
key = np.asarray(key, dtype=bool)
525+
524526
key = check_array_indexer(self, key)
525527
if key.all():
526528
key = slice(0, None, None)

Diff for: pandas/core/common.py

+1-6
Original file line numberDiff line numberDiff line change
@@ -118,24 +118,19 @@ def is_bool_indexer(key: Any) -> bool:
118118
check_array_indexer : Check that `key` is a valid array to index,
119119
and convert to an ndarray.
120120
"""
121-
na_msg = "cannot mask with array containing NA / NaN values"
122121
if isinstance(key, (ABCSeries, np.ndarray, ABCIndex)) or (
123122
is_array_like(key) and is_extension_array_dtype(key.dtype)
124123
):
125124
if key.dtype == np.object_:
126125
key = np.asarray(values_from_object(key))
127126

128127
if not lib.is_bool_array(key):
128+
na_msg = "Cannot mask with non-boolean array containing NA / NaN values"
129129
if isna(key).any():
130130
raise ValueError(na_msg)
131131
return False
132132
return True
133133
elif is_bool_dtype(key.dtype):
134-
# an ndarray with bool-dtype by definition has no missing values.
135-
# So we only need to check for NAs in ExtensionArrays
136-
if is_extension_array_dtype(key.dtype):
137-
if np.any(key.isna()):
138-
raise ValueError(na_msg)
139134
return True
140135
elif isinstance(key, list):
141136
try:

Diff for: pandas/core/indexers.py

+6-8
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
from pandas.core.dtypes.common import (
1111
is_array_like,
1212
is_bool_dtype,
13+
is_extension_array_dtype,
1314
is_integer_dtype,
1415
is_list_like,
1516
)
@@ -366,14 +367,11 @@ def check_array_indexer(array: AnyArrayLike, indexer: Any) -> Any:
366367
...
367368
IndexError: Boolean index has wrong length: 3 instead of 2.
368369
369-
A ValueError is raised when the mask cannot be converted to
370-
a bool-dtype ndarray.
370+
NA values in a boolean array are treated as False.
371371
372372
>>> mask = pd.array([True, pd.NA])
373373
>>> pd.api.indexers.check_array_indexer(arr, mask)
374-
Traceback (most recent call last):
375-
...
376-
ValueError: Cannot mask with a boolean indexer containing NA values
374+
array([ True, False])
377375
378376
A numpy boolean mask will get passed through (if the length is correct):
379377
@@ -425,10 +423,10 @@ def check_array_indexer(array: AnyArrayLike, indexer: Any) -> Any:
425423

426424
dtype = indexer.dtype
427425
if is_bool_dtype(dtype):
428-
try:
426+
if is_extension_array_dtype(dtype):
427+
indexer = indexer.to_numpy(dtype=bool, na_value=False)
428+
else:
429429
indexer = np.asarray(indexer, dtype=bool)
430-
except ValueError:
431-
raise ValueError("Cannot mask with a boolean indexer containing NA values")
432430

433431
# GH26658
434432
if len(indexer) != len(array):

Diff for: pandas/core/indexing.py

+5-2
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
is_iterator,
1414
is_list_like,
1515
is_numeric_dtype,
16+
is_object_dtype,
1617
is_scalar,
1718
is_sequence,
1819
)
@@ -2189,10 +2190,12 @@ def check_bool_indexer(index: Index, key) -> np.ndarray:
21892190
"the indexed object do not match)."
21902191
)
21912192
result = result.astype(bool)._values
2192-
else:
2193-
# key might be sparse / object-dtype bool, check_array_indexer needs bool array
2193+
elif is_object_dtype(key):
2194+
# key might be object-dtype bool, check_array_indexer needs bool array
21942195
result = np.asarray(result, dtype=bool)
21952196
result = check_array_indexer(index, result)
2197+
else:
2198+
result = check_array_indexer(index, result)
21962199

21972200
return result
21982201

Diff for: pandas/tests/arrays/categorical/test_indexing.py

+6-3
Original file line numberDiff line numberDiff line change
@@ -240,14 +240,17 @@ def test_mask_with_boolean(index):
240240

241241

242242
@pytest.mark.parametrize("index", [True, False])
243-
def test_mask_with_boolean_raises(index):
243+
def test_mask_with_boolean_na_treated_as_false(index):
244+
# https://github.com/pandas-dev/pandas/issues/31503
244245
s = Series(range(3))
245246
idx = Categorical([True, False, None])
246247
if index:
247248
idx = CategoricalIndex(idx)
248249

249-
with pytest.raises(ValueError, match="NA / NaN"):
250-
s[idx]
250+
result = s[idx]
251+
expected = s[idx.fillna(False)]
252+
253+
tm.assert_series_equal(result, expected)
251254

252255

253256
@pytest.fixture

Diff for: pandas/tests/extension/base/getitem.py

+11-9
Original file line numberDiff line numberDiff line change
@@ -158,21 +158,23 @@ def test_getitem_boolean_array_mask(self, data):
158158
result = pd.Series(data)[mask]
159159
self.assert_series_equal(result, expected)
160160

161-
def test_getitem_boolean_array_mask_raises(self, data):
161+
def test_getitem_boolean_na_treated_as_false(self, data):
162+
# https://github.com/pandas-dev/pandas/issues/31503
162163
mask = pd.array(np.zeros(data.shape, dtype="bool"), dtype="boolean")
163164
mask[:2] = pd.NA
165+
mask[2:4] = True
164166

165-
msg = (
166-
"Cannot mask with a boolean indexer containing NA values|"
167-
"cannot mask with array containing NA / NaN values"
168-
)
169-
with pytest.raises(ValueError, match=msg):
170-
data[mask]
167+
result = data[mask]
168+
expected = data[mask.fillna(False)]
169+
170+
self.assert_extension_array_equal(result, expected)
171171

172172
s = pd.Series(data)
173173

174-
with pytest.raises(ValueError):
175-
s[mask]
174+
result = s[mask]
175+
expected = s[mask.fillna(False)]
176+
177+
self.assert_series_equal(result, expected)
176178

177179
@pytest.mark.parametrize(
178180
"idx",

Diff for: pandas/tests/extension/base/setitem.py

+8-10
Original file line numberDiff line numberDiff line change
@@ -98,8 +98,9 @@ def test_setitem_iloc_scalar_multiple_homogoneous(self, data):
9898
[
9999
np.array([True, True, True, False, False]),
100100
pd.array([True, True, True, False, False], dtype="boolean"),
101+
pd.array([True, True, True, pd.NA, pd.NA], dtype="boolean"),
101102
],
102-
ids=["numpy-array", "boolean-array"],
103+
ids=["numpy-array", "boolean-array", "boolean-array-na"],
103104
)
104105
def test_setitem_mask(self, data, mask, box_in_series):
105106
arr = data[:5].copy()
@@ -124,20 +125,17 @@ def test_setitem_mask_raises(self, data, box_in_series):
124125
with pytest.raises(IndexError, match="wrong length"):
125126
data[mask] = data[0]
126127

127-
def test_setitem_mask_boolean_array_raises(self, data, box_in_series):
128-
# missing values in mask
128+
def test_setitem_mask_boolean_array_with_na(self, data, box_in_series):
129129
mask = pd.array(np.zeros(data.shape, dtype="bool"), dtype="boolean")
130-
mask[:2] = pd.NA
130+
mask[:3] = True
131+
mask[3:5] = pd.NA
131132

132133
if box_in_series:
133134
data = pd.Series(data)
134135

135-
msg = (
136-
"Cannot mask with a boolean indexer containing NA values|"
137-
"cannot mask with array containing NA / NaN values"
138-
)
139-
with pytest.raises(ValueError, match=msg):
140-
data[mask] = data[0]
136+
data[mask] = data[0]
137+
138+
assert (data[:3] == data[0]).all()
141139

142140
@pytest.mark.parametrize(
143141
"idx",

Diff for: pandas/tests/extension/test_numpy.py

-4
Original file line numberDiff line numberDiff line change
@@ -415,10 +415,6 @@ def test_setitem_mask(self, data, mask, box_in_series):
415415
def test_setitem_mask_raises(self, data, box_in_series):
416416
super().test_setitem_mask_raises(data, box_in_series)
417417

418-
@skip_nested
419-
def test_setitem_mask_boolean_array_raises(self, data, box_in_series):
420-
super().test_setitem_mask_boolean_array_raises(data, box_in_series)
421-
422418
@skip_nested
423419
@pytest.mark.parametrize(
424420
"idx",

Diff for: pandas/tests/indexing/test_check_indexer.py

+7-5
Original file line numberDiff line numberDiff line change
@@ -34,12 +34,14 @@ def test_valid_input(indexer, expected):
3434
@pytest.mark.parametrize(
3535
"indexer", [[True, False, None], pd.array([True, False, None], dtype="boolean")],
3636
)
37-
def test_bool_raise_missing_values(indexer):
38-
array = np.array([1, 2, 3])
37+
def test_boolean_na_returns_indexer(indexer):
38+
# https://github.com/pandas-dev/pandas/issues/31503
39+
arr = np.array([1, 2, 3])
3940

40-
msg = "Cannot mask with a boolean indexer containing NA values"
41-
with pytest.raises(ValueError, match=msg):
42-
check_array_indexer(array, indexer)
41+
result = check_array_indexer(arr, indexer)
42+
expected = np.array([True, False, False], dtype=bool)
43+
44+
tm.assert_numpy_array_equal(result, expected)
4345

4446

4547
@pytest.mark.parametrize(

Diff for: pandas/tests/indexing/test_na_indexing.py

+19-8
Original file line numberDiff line numberDiff line change
@@ -62,18 +62,29 @@ def test_series_mask_boolean(values, dtype, mask, box_mask, frame):
6262

6363

6464
@pytest.mark.parametrize("frame", [True, False])
65-
def test_indexing_with_na_raises(frame):
65+
def test_na_treated_as_false(frame):
66+
# https://github.com/pandas-dev/pandas/issues/31503
6667
s = pd.Series([1, 2, 3], name="name")
6768

6869
if frame:
6970
s = s.to_frame()
71+
7072
mask = pd.array([True, False, None], dtype="boolean")
71-
match = "cannot mask with array containing NA / NaN values"
72-
with pytest.raises(ValueError, match=match):
73-
s[mask]
7473

75-
with pytest.raises(ValueError, match=match):
76-
s.loc[mask]
74+
result = s[mask]
75+
expected = s[mask.fillna(False)]
76+
77+
result_loc = s.loc[mask]
78+
expected_loc = s.loc[mask.fillna(False)]
7779

78-
with pytest.raises(ValueError, match=match):
79-
s.iloc[mask]
80+
result_iloc = s.iloc[mask]
81+
expected_iloc = s.iloc[mask.fillna(False)]
82+
83+
if frame:
84+
tm.assert_frame_equal(result, expected)
85+
tm.assert_frame_equal(result_loc, expected_loc)
86+
tm.assert_frame_equal(result_iloc, expected_iloc)
87+
else:
88+
tm.assert_series_equal(result, expected)
89+
tm.assert_series_equal(result_loc, expected_loc)
90+
tm.assert_series_equal(result_iloc, expected_iloc)

Diff for: pandas/tests/series/indexing/test_boolean.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -72,7 +72,7 @@ def test_getitem_boolean_object(string_series):
7272

7373
# nans raise exception
7474
omask[5:10] = np.nan
75-
msg = "cannot mask with array containing NA / NaN values"
75+
msg = "Cannot mask with non-boolean array containing NA / NaN values"
7676
with pytest.raises(ValueError, match=msg):
7777
s[omask]
7878
with pytest.raises(ValueError, match=msg):

0 commit comments

Comments
 (0)