Skip to content

Commit 1bc1d59

Browse files
authored
Backport PR pandas-dev#31591: ENH: Enable indexing with nullable Boolean (pandas-dev#32192)
1 parent eed4bd2 commit 1bc1d59

File tree

14 files changed

+227
-52
lines changed

14 files changed

+227
-52
lines changed

doc/source/user_guide/boolean.rst

+5-5
Original file line numberDiff line numberDiff line change
@@ -20,8 +20,9 @@ Nullable Boolean Data Type
2020
Indexing with NA values
2121
-----------------------
2222

23-
pandas does not allow indexing with NA values. Attempting to do so
24-
will raise a ``ValueError``.
23+
pandas allows indexing with ``NA`` values in a boolean array, which are treated as ``False``.
24+
25+
.. versionchanged:: 1.0.2
2526

2627
.. ipython:: python
2728
:okexcept:
@@ -30,12 +31,11 @@ will raise a ``ValueError``.
3031
mask = pd.array([True, False, pd.NA], dtype="boolean")
3132
s[mask]
3233
33-
The missing values will need to be explicitly filled with True or False prior
34-
to using the array as a mask.
34+
If you would prefer to keep the elements at the ``NA`` positions, you can fill the mask manually with ``fillna(True)``.
3535

3636
.. ipython:: python
3737
38-
s[mask.fillna(False)]
38+
s[mask.fillna(True)]
3939
4040
.. _boolean.kleene:
4141

doc/source/user_guide/indexing.rst

+10-2
Original file line numberDiff line numberDiff line change
@@ -59,7 +59,7 @@ of multi-axis indexing.
5959
slices, **both** the start and the stop are included, when present in the
6060
index! See :ref:`Slicing with labels <indexing.slicing_with_labels>`
6161
and :ref:`Endpoints are inclusive <advanced.endpoints_are_inclusive>`.)
62-
* A boolean array
62+
* A boolean array (any ``NA`` values will be treated as ``False``).
6363
* A ``callable`` function with one argument (the calling Series or DataFrame) and
6464
that returns valid output for indexing (one of the above).
6565

@@ -75,7 +75,7 @@ of multi-axis indexing.
7575
* An integer e.g. ``5``.
7676
* A list or array of integers ``[4, 3, 0]``.
7777
* A slice object with ints ``1:7``.
78-
* A boolean array.
78+
* A boolean array (any ``NA`` values will be treated as ``False``).
7979
* A ``callable`` function with one argument (the calling Series or DataFrame) and
8080
that returns valid output for indexing (one of the above).
8181

@@ -374,6 +374,14 @@ For getting values with a boolean array:
374374
df1.loc['a'] > 0
375375
df1.loc[:, df1.loc['a'] > 0]
376376
377+
NA values in a boolean array propagate as ``False``:
378+
379+
.. versionchanged:: 1.0.2
380+
381+
mask = pd.array([True, False, True, False, pd.NA, False], dtype="boolean")
382+
mask
383+
df1[mask]
384+
377385
For getting a value explicitly:
378386

379387
.. ipython:: python

doc/source/whatsnew/v1.0.2.rst

+27-2
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,33 @@ Fixed regressions
2626

2727
.. ---------------------------------------------------------------------------
2828
29+
Indexing with Nullable Boolean Arrays
30+
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
31+
32+
Previously, indexing with a nullable Boolean array containing ``NA`` would raise a ``ValueError``; this is now permitted, with ``NA`` being treated as ``False``. (:issue:`31503`)
33+
34+
.. ipython:: python
35+
36+
s = pd.Series([1, 2, 3, 4])
37+
mask = pd.array([True, True, False, None], dtype="boolean")
38+
s
39+
mask
40+
41+
*pandas 1.0.0-1.0.1*
42+
43+
.. code-block:: python
44+
45+
>>> s[mask]
46+
Traceback (most recent call last):
47+
...
48+
ValueError: cannot mask with array containing NA / NaN values
49+
50+
*pandas 1.0.2*
51+
52+
.. ipython:: python
53+
54+
s[mask]
55+
2956
.. _whatsnew_102.bug_fixes:
3057

3158
Bug fixes
@@ -45,8 +72,6 @@ Bug fixes
4572
- Using ``pd.NA`` with :meth:`DataFrame.to_json` now correctly outputs a null value instead of an empty object (:issue:`31615`)
4673
- Fixed bug in parquet roundtrip with nullable unsigned integer dtypes (:issue:`31896`).
4774

48-
49-
5075
**Experimental dtypes**
5176

5277
- Fix bug in :meth:`DataFrame.convert_dtypes` for columns that were already using the ``"string"`` dtype (:issue:`31731`).

pandas/core/arrays/datetimelike.py

+3-1
Original file line numberDiff line numberDiff line change
@@ -520,7 +520,9 @@ def __getitem__(self, key):
520520
if com.is_bool_indexer(key):
521521
# first convert to boolean, because check_array_indexer doesn't
522522
# allow object dtype
523-
key = np.asarray(key, dtype=bool)
523+
if is_object_dtype(key):
524+
key = np.asarray(key, dtype=bool)
525+
524526
key = check_array_indexer(self, key)
525527
if key.all():
526528
key = slice(0, None, None)

pandas/core/common.py

+1-6
Original file line numberDiff line numberDiff line change
@@ -124,24 +124,19 @@ def is_bool_indexer(key: Any) -> bool:
124124
check_array_indexer : Check that `key` is a valid array to index,
125125
and convert to an ndarray.
126126
"""
127-
na_msg = "cannot mask with array containing NA / NaN values"
128127
if isinstance(key, (ABCSeries, np.ndarray, ABCIndex)) or (
129128
is_array_like(key) and is_extension_array_dtype(key.dtype)
130129
):
131130
if key.dtype == np.object_:
132131
key = np.asarray(values_from_object(key))
133132

134133
if not lib.is_bool_array(key):
134+
na_msg = "Cannot mask with non-boolean array containing NA / NaN values"
135135
if isna(key).any():
136136
raise ValueError(na_msg)
137137
return False
138138
return True
139139
elif is_bool_dtype(key.dtype):
140-
# an ndarray with bool-dtype by definition has no missing values.
141-
# So we only need to check for NAs in ExtensionArrays
142-
if is_extension_array_dtype(key.dtype):
143-
if np.any(key.isna()):
144-
raise ValueError(na_msg)
145140
return True
146141
elif isinstance(key, list):
147142
try:

pandas/core/indexers.py

+6-8
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
from pandas.core.dtypes.common import (
1111
is_array_like,
1212
is_bool_dtype,
13+
is_extension_array_dtype,
1314
is_integer_dtype,
1415
is_list_like,
1516
)
@@ -333,14 +334,11 @@ def check_array_indexer(array: AnyArrayLike, indexer: Any) -> Any:
333334
...
334335
IndexError: Boolean index has wrong length: 3 instead of 2.
335336
336-
A ValueError is raised when the mask cannot be converted to
337-
a bool-dtype ndarray.
337+
NA values in a boolean array are treated as False.
338338
339339
>>> mask = pd.array([True, pd.NA])
340340
>>> pd.api.indexers.check_array_indexer(arr, mask)
341-
Traceback (most recent call last):
342-
...
343-
ValueError: Cannot mask with a boolean indexer containing NA values
341+
array([ True, False])
344342
345343
A numpy boolean mask will get passed through (if the length is correct):
346344
@@ -392,10 +390,10 @@ def check_array_indexer(array: AnyArrayLike, indexer: Any) -> Any:
392390

393391
dtype = indexer.dtype
394392
if is_bool_dtype(dtype):
395-
try:
393+
if is_extension_array_dtype(dtype):
394+
indexer = indexer.to_numpy(dtype=bool, na_value=False)
395+
else:
396396
indexer = np.asarray(indexer, dtype=bool)
397-
except ValueError:
398-
raise ValueError("Cannot mask with a boolean indexer containing NA values")
399397

400398
# GH26658
401399
if len(indexer) != len(array):

pandas/core/indexing.py

+5-2
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
is_iterator,
1414
is_list_like,
1515
is_numeric_dtype,
16+
is_object_dtype,
1617
is_scalar,
1718
is_sequence,
1819
)
@@ -2319,10 +2320,12 @@ def check_bool_indexer(index: Index, key) -> np.ndarray:
23192320
"the indexed object do not match)."
23202321
)
23212322
result = result.astype(bool)._values
2322-
else:
2323-
# key might be sparse / object-dtype bool, check_array_indexer needs bool array
2323+
elif is_object_dtype(key):
2324+
# key might be object-dtype bool, check_array_indexer needs bool array
23242325
result = np.asarray(result, dtype=bool)
23252326
result = check_array_indexer(index, result)
2327+
else:
2328+
result = check_array_indexer(index, result)
23262329

23272330
return result
23282331

pandas/tests/arrays/categorical/test_indexing.py

+6-3
Original file line numberDiff line numberDiff line change
@@ -240,14 +240,17 @@ def test_mask_with_boolean(index):
240240

241241

242242
@pytest.mark.parametrize("index", [True, False])
243-
def test_mask_with_boolean_raises(index):
243+
def test_mask_with_boolean_na_treated_as_false(index):
244+
# https://github.com/pandas-dev/pandas/issues/31503
244245
s = Series(range(3))
245246
idx = Categorical([True, False, None])
246247
if index:
247248
idx = CategoricalIndex(idx)
248249

249-
with pytest.raises(ValueError, match="NA / NaN"):
250-
s[idx]
250+
result = s[idx]
251+
expected = s[idx.fillna(False)]
252+
253+
tm.assert_series_equal(result, expected)
251254

252255

253256
@pytest.fixture

pandas/tests/extension/base/getitem.py

+11-9
Original file line numberDiff line numberDiff line change
@@ -158,21 +158,23 @@ def test_getitem_boolean_array_mask(self, data):
158158
result = pd.Series(data)[mask]
159159
self.assert_series_equal(result, expected)
160160

161-
def test_getitem_boolean_array_mask_raises(self, data):
161+
def test_getitem_boolean_na_treated_as_false(self, data):
162+
# https://github.com/pandas-dev/pandas/issues/31503
162163
mask = pd.array(np.zeros(data.shape, dtype="bool"), dtype="boolean")
163164
mask[:2] = pd.NA
165+
mask[2:4] = True
164166

165-
msg = (
166-
"Cannot mask with a boolean indexer containing NA values|"
167-
"cannot mask with array containing NA / NaN values"
168-
)
169-
with pytest.raises(ValueError, match=msg):
170-
data[mask]
167+
result = data[mask]
168+
expected = data[mask.fillna(False)]
169+
170+
self.assert_extension_array_equal(result, expected)
171171

172172
s = pd.Series(data)
173173

174-
with pytest.raises(ValueError):
175-
s[mask]
174+
result = s[mask]
175+
expected = s[mask.fillna(False)]
176+
177+
self.assert_series_equal(result, expected)
176178

177179
@pytest.mark.parametrize(
178180
"idx",

pandas/tests/extension/base/setitem.py

+84
Original file line numberDiff line numberDiff line change
@@ -93,6 +93,90 @@ def test_setitem_iloc_scalar_multiple_homogoneous(self, data):
9393
df.iloc[10, 1] = data[1]
9494
assert df.loc[10, "B"] == data[1]
9595

96+
@pytest.mark.parametrize(
97+
"mask",
98+
[
99+
np.array([True, True, True, False, False]),
100+
pd.array([True, True, True, False, False], dtype="boolean"),
101+
pd.array([True, True, True, pd.NA, pd.NA], dtype="boolean"),
102+
],
103+
ids=["numpy-array", "boolean-array", "boolean-array-na"],
104+
)
105+
def test_setitem_mask(self, data, mask, box_in_series):
106+
arr = data[:5].copy()
107+
expected = arr.take([0, 0, 0, 3, 4])
108+
if box_in_series:
109+
arr = pd.Series(arr)
110+
expected = pd.Series(expected)
111+
arr[mask] = data[0]
112+
self.assert_equal(expected, arr)
113+
114+
def test_setitem_mask_raises(self, data, box_in_series):
115+
# wrong length
116+
mask = np.array([True, False])
117+
118+
if box_in_series:
119+
data = pd.Series(data)
120+
121+
with pytest.raises(IndexError, match="wrong length"):
122+
data[mask] = data[0]
123+
124+
mask = pd.array(mask, dtype="boolean")
125+
with pytest.raises(IndexError, match="wrong length"):
126+
data[mask] = data[0]
127+
128+
def test_setitem_mask_boolean_array_with_na(self, data, box_in_series):
129+
mask = pd.array(np.zeros(data.shape, dtype="bool"), dtype="boolean")
130+
mask[:3] = True
131+
mask[3:5] = pd.NA
132+
133+
if box_in_series:
134+
data = pd.Series(data)
135+
136+
data[mask] = data[0]
137+
138+
assert (data[:3] == data[0]).all()
139+
140+
@pytest.mark.parametrize(
141+
"idx",
142+
[[0, 1, 2], pd.array([0, 1, 2], dtype="Int64"), np.array([0, 1, 2])],
143+
ids=["list", "integer-array", "numpy-array"],
144+
)
145+
def test_setitem_integer_array(self, data, idx, box_in_series):
146+
arr = data[:5].copy()
147+
expected = data.take([0, 0, 0, 3, 4])
148+
149+
if box_in_series:
150+
arr = pd.Series(arr)
151+
expected = pd.Series(expected)
152+
153+
arr[idx] = arr[0]
154+
self.assert_equal(arr, expected)
155+
156+
@pytest.mark.parametrize(
157+
"idx, box_in_series",
158+
[
159+
([0, 1, 2, pd.NA], False),
160+
pytest.param(
161+
[0, 1, 2, pd.NA], True, marks=pytest.mark.xfail(reason="GH-31948")
162+
),
163+
(pd.array([0, 1, 2, pd.NA], dtype="Int64"), False),
164+
(pd.array([0, 1, 2, pd.NA], dtype="Int64"), False),
165+
],
166+
ids=["list-False", "list-True", "integer-array-False", "integer-array-True"],
167+
)
168+
def test_setitem_integer_with_missing_raises(self, data, idx, box_in_series):
169+
arr = data.copy()
170+
171+
# TODO(xfail) this raises KeyError about labels not found (it tries label-based)
172+
# for list of labels with Series
173+
if box_in_series:
174+
arr = pd.Series(data, index=[tm.rands(4) for _ in range(len(data))])
175+
176+
msg = "Cannot index with an integer indexer containing NA values"
177+
with pytest.raises(ValueError, match=msg):
178+
arr[idx] = arr[0]
179+
96180
@pytest.mark.parametrize("as_callable", [True, False])
97181
@pytest.mark.parametrize("setter", ["loc", None])
98182
def test_setitem_mask_aligned(self, data, as_callable, setter):

pandas/tests/extension/test_numpy.py

+42
Original file line numberDiff line numberDiff line change
@@ -396,6 +396,48 @@ def test_setitem_scalar_key_sequence_raise(self, data):
396396
# Failed: DID NOT RAISE <class 'ValueError'>
397397
super().test_setitem_scalar_key_sequence_raise(data)
398398

399+
# TODO: there is some issue with PandasArray, therefore,
400+
# skip the setitem test for now, and fix it later (GH 31446)
401+
402+
@skip_nested
403+
@pytest.mark.parametrize(
404+
"mask",
405+
[
406+
np.array([True, True, True, False, False]),
407+
pd.array([True, True, True, False, False], dtype="boolean"),
408+
],
409+
ids=["numpy-array", "boolean-array"],
410+
)
411+
def test_setitem_mask(self, data, mask, box_in_series):
412+
super().test_setitem_mask(data, mask, box_in_series)
413+
414+
@skip_nested
415+
def test_setitem_mask_raises(self, data, box_in_series):
416+
super().test_setitem_mask_raises(data, box_in_series)
417+
418+
@skip_nested
419+
@pytest.mark.parametrize(
420+
"idx",
421+
[[0, 1, 2], pd.array([0, 1, 2], dtype="Int64"), np.array([0, 1, 2])],
422+
ids=["list", "integer-array", "numpy-array"],
423+
)
424+
def test_setitem_integer_array(self, data, idx, box_in_series):
425+
super().test_setitem_integer_array(data, idx, box_in_series)
426+
427+
@skip_nested
428+
@pytest.mark.parametrize(
429+
"idx, box_in_series",
430+
[
431+
([0, 1, 2, pd.NA], False),
432+
pytest.param([0, 1, 2, pd.NA], True, marks=pytest.mark.xfail),
433+
(pd.array([0, 1, 2, pd.NA], dtype="Int64"), False),
434+
(pd.array([0, 1, 2, pd.NA], dtype="Int64"), False),
435+
],
436+
ids=["list-False", "list-True", "integer-array-False", "integer-array-True"],
437+
)
438+
def test_setitem_integer_with_missing_raises(self, data, idx, box_in_series):
439+
super().test_setitem_integer_with_missing_raises(data, idx, box_in_series)
440+
399441
@skip_nested
400442
def test_setitem_slice(self, data, box_in_series):
401443
super().test_setitem_slice(data, box_in_series)

0 commit comments

Comments
 (0)