Skip to content

Commit 3efaeb3

Browse files
jorisvandenbossche authored and proost committed
API: Use new NA scalar in BooleanArray (pandas-dev#29961)
1 parent f90ba44 commit 3efaeb3

File tree

5 files changed

+129
-64
lines changed

5 files changed

+129
-64
lines changed

pandas/_libs/missing.pyx

+2-1
Original file line number | Diff line number | Diff line change
@@ -289,7 +289,8 @@ cdef inline bint is_null_period(v):
289289
def _create_binary_propagating_op(name, divmod=False):
290290

291291
def method(self, other):
292-
if other is C_NA or isinstance(other, str) or isinstance(other, numbers.Number):
292+
if (other is C_NA or isinstance(other, str)
293+
or isinstance(other, (numbers.Number, np.bool_))):
293294
if divmod:
294295
return NA, NA
295296
else:

pandas/core/arrays/boolean.py

+52-43
Original file line number | Diff line number | Diff line change
@@ -1,10 +1,10 @@
11
import numbers
2-
from typing import TYPE_CHECKING, Type
2+
from typing import TYPE_CHECKING, Any, Tuple, Type
33
import warnings
44

55
import numpy as np
66

7-
from pandas._libs import lib
7+
from pandas._libs import lib, missing as libmissing
88
from pandas.compat import set_function_name
99

1010
from pandas.core.dtypes.base import ExtensionDtype
@@ -61,13 +61,13 @@ class BooleanDtype(ExtensionDtype):
6161
@property
6262
def na_value(self) -> "Scalar":
6363
"""
64-
BooleanDtype uses :attr:`numpy.nan` as the missing NA value.
64+
BooleanDtype uses :attr:`pandas.NA` as the missing NA value.
6565
6666
.. warning::
6767
6868
`na_value` may change in a future release.
6969
"""
70-
return np.nan
70+
return libmissing.NA
7171

7272
@property
7373
def type(self) -> Type:
@@ -223,7 +223,7 @@ class BooleanArray(ExtensionArray, ExtensionOpsMixin):
223223
224224
>>> pd.array([True, False, None], dtype="boolean")
225225
<BooleanArray>
226-
[True, False, NaN]
226+
[True, False, NA]
227227
Length: 3, dtype: boolean
228228
"""
229229

@@ -262,17 +262,17 @@ def _from_sequence(cls, scalars, dtype=None, copy: bool = False):
262262
values, mask = coerce_to_array(scalars, copy=copy)
263263
return BooleanArray(values, mask)
264264

265+
def _values_for_factorize(self) -> Tuple[np.ndarray, Any]:
266+
data = self._data.astype("int8")
267+
data[self._mask] = -1
268+
return data, -1
269+
265270
@classmethod
266271
def _from_factorized(cls, values, original: "BooleanArray"):
267272
return cls._from_sequence(values, dtype=original.dtype)
268273

269274
def _formatter(self, boxed=False):
270-
def fmt(x):
271-
if isna(x):
272-
return "NaN"
273-
return str(x)
274-
275-
return fmt
275+
return str
276276

277277
def __getitem__(self, item):
278278
if is_integer(item):
@@ -281,25 +281,29 @@ def __getitem__(self, item):
281281
return self._data[item]
282282
return type(self)(self._data[item], self._mask[item])
283283

284-
def _coerce_to_ndarray(self, force_bool: bool = False):
284+
def _coerce_to_ndarray(self, dtype=None, na_value: "Scalar" = libmissing.NA):
285285
"""
286286
Coerce to an ndarray of the given dtype (object dtype by default).
287287
288288
Parameters
289289
----------
290-
force_bool : bool, default False
291-
If True, return bool array or raise error if not possible (in
292-
presence of missing values)
290+
dtype : dtype, default object
291+
The numpy dtype to convert to
292+
na_value : scalar, optional
293+
Scalar missing value indicator to use in numpy array. Defaults
294+
to the native missing value indicator of this array (pd.NA).
293295
"""
294-
if force_bool:
296+
if dtype is None:
297+
dtype = object
298+
if is_bool_dtype(dtype):
295299
if not self.isna().any():
296300
return self._data
297301
else:
298302
raise ValueError(
299303
"cannot convert to bool numpy array in presence of missing values"
300304
)
301-
data = self._data.astype(object)
302-
data[self._mask] = self._na_value
305+
data = self._data.astype(dtype)
306+
data[self._mask] = na_value
303307
return data
304308

305309
__array_priority__ = 1000 # higher than ndarray so ops dispatch to us
@@ -309,15 +313,8 @@ def __array__(self, dtype=None):
309313
the array interface, return my values
310314
We return an object array here to preserve our scalar values
311315
"""
312-
if dtype is not None:
313-
if is_bool_dtype(dtype):
314-
return self._coerce_to_ndarray(force_bool=True)
315-
# TODO can optimize this to not go through object dtype for
316-
# numeric dtypes
317-
arr = self._coerce_to_ndarray()
318-
return arr.astype(dtype, copy=False)
319316
# by default (no dtype specified), return an object array
320-
return self._coerce_to_ndarray()
317+
return self._coerce_to_ndarray(dtype=dtype)
321318

322319
def __arrow_array__(self, type=None):
323320
"""
@@ -483,8 +480,17 @@ def astype(self, dtype, copy=True):
483480
return IntegerArray(
484481
self._data.astype(dtype.numpy_dtype), self._mask.copy(), copy=False
485482
)
483+
# for integer, error if there are missing values
484+
if is_integer_dtype(dtype):
485+
if self.isna().any():
486+
raise ValueError("cannot convert NA to integer")
487+
# for float dtype, ensure we use np.nan before casting (numpy cannot
488+
# deal with pd.NA)
489+
na_value = self._na_value
490+
if is_float_dtype(dtype):
491+
na_value = np.nan
486492
# coerce
487-
data = self._coerce_to_ndarray()
493+
data = self._coerce_to_ndarray(na_value=na_value)
488494
return astype_nansafe(data, dtype, copy=None)
489495

490496
def value_counts(self, dropna=True):
@@ -594,8 +600,6 @@ def logical_method(self, other):
594600

595601
@classmethod
596602
def _create_comparison_method(cls, op):
597-
op_name = op.__name__
598-
599603
def cmp_method(self, other):
600604

601605
if isinstance(other, (ABCDataFrame, ABCSeries, ABCIndexClass)):
@@ -617,21 +621,26 @@ def cmp_method(self, other):
617621
if len(self) != len(other):
618622
raise ValueError("Lengths must match to compare")
619623

620-
# numpy will show a DeprecationWarning on invalid elementwise
621-
# comparisons, this will raise in the future
622-
with warnings.catch_warnings():
623-
warnings.filterwarnings("ignore", "elementwise", FutureWarning)
624-
with np.errstate(all="ignore"):
625-
result = op(self._data, other)
626-
627-
# nans propagate
628-
if mask is None:
629-
mask = self._mask
624+
if other is libmissing.NA:
625+
# numpy does not handle pd.NA well as "other" scalar (it returns
626+
# a scalar False instead of an array)
627+
result = np.zeros_like(self._data)
628+
mask = np.ones_like(self._data)
630629
else:
631-
mask = self._mask | mask
630+
# numpy will show a DeprecationWarning on invalid elementwise
631+
# comparisons, this will raise in the future
632+
with warnings.catch_warnings():
633+
warnings.filterwarnings("ignore", "elementwise", FutureWarning)
634+
with np.errstate(all="ignore"):
635+
result = op(self._data, other)
636+
637+
# nans propagate
638+
if mask is None:
639+
mask = self._mask.copy()
640+
else:
641+
mask = self._mask | mask
632642

633-
result[mask] = op_name == "ne"
634-
return BooleanArray(result, np.zeros(len(result), dtype=bool), copy=False)
643+
return BooleanArray(result, mask, copy=False)
635644

636645
name = "__{name}__".format(name=op.__name__)
637646
return set_function_name(cmp_method, name, cls)
@@ -643,7 +652,7 @@ def _reduce(self, name, skipna=True, **kwargs):
643652
# coerce to a nan-aware float if needed
644653
if mask.any():
645654
data = self._data.astype("float64")
646-
data[mask] = self._na_value
655+
data[mask] = np.nan
647656

648657
op = getattr(nanops, "nan" + name)
649658
result = op(data, axis=0, skipna=skipna, mask=mask, **kwargs)

pandas/tests/arrays/test_boolean.py

+62-15
Original file line number | Diff line number | Diff line change
@@ -101,13 +101,14 @@ def test_to_boolean_array_all_none():
101101
@pytest.mark.parametrize(
102102
"a, b",
103103
[
104-
([True, None], [True, np.nan]),
105-
([None], [np.nan]),
106-
([None, np.nan], [np.nan, np.nan]),
107-
([np.nan, np.nan], [np.nan, np.nan]),
104+
([True, False, None, np.nan, pd.NA], [True, False, None, None, None]),
105+
([True, np.nan], [True, None]),
106+
([True, pd.NA], [True, None]),
107+
([np.nan, np.nan], [None, None]),
108+
(np.array([np.nan, np.nan], dtype=float), [None, None]),
108109
],
109110
)
110-
def test_to_boolean_array_none_is_nan(a, b):
111+
def test_to_boolean_array_missing_indicators(a, b):
111112
result = pd.array(a, dtype="boolean")
112113
expected = pd.array(b, dtype="boolean")
113114
tm.assert_extension_array_equal(result, expected)
@@ -216,7 +217,7 @@ def test_coerce_to_numpy_array():
216217
# with missing values -> object dtype
217218
arr = pd.array([True, False, None], dtype="boolean")
218219
result = np.array(arr)
219-
expected = np.array([True, False, None], dtype="object")
220+
expected = np.array([True, False, pd.NA], dtype="object")
220221
tm.assert_numpy_array_equal(result, expected)
221222

222223
# also with no missing values -> object dtype
@@ -238,12 +239,11 @@ def test_coerce_to_numpy_array():
238239
def test_astype():
239240
# with missing values
240241
arr = pd.array([True, False, None], dtype="boolean")
241-
msg = "cannot convert float NaN to"
242242

243-
with pytest.raises(ValueError, match=msg):
243+
with pytest.raises(ValueError, match="cannot convert NA to integer"):
244244
arr.astype("int64")
245245

246-
with pytest.raises(ValueError, match=msg):
246+
with pytest.raises(ValueError, match="cannot convert float NaN to"):
247247
arr.astype("bool")
248248

249249
result = arr.astype("float64")
@@ -280,6 +280,14 @@ def test_astype_to_integer_array():
280280
tm.assert_extension_array_equal(result, expected)
281281

282282

283+
@pytest.mark.parametrize("na", [None, np.nan, pd.NA])
284+
def test_setitem_missing_values(na):
285+
arr = pd.array([True, False, None], dtype="boolean")
286+
expected = pd.array([True, None, None], dtype="boolean")
287+
arr[1] = na
288+
tm.assert_extension_array_equal(arr, expected)
289+
290+
283291
@pytest.mark.parametrize(
284292
"ufunc", [np.add, np.logical_or, np.logical_and, np.logical_xor]
285293
)
@@ -406,9 +414,8 @@ def _compare_other(self, data, op_name, other):
406414
# array
407415
result = pd.Series(op(data, other))
408416
expected = pd.Series(op(data._data, other), dtype="boolean")
409-
410-
# fill the nan locations
411-
expected[data._mask] = op_name == "__ne__"
417+
# propagate NAs
418+
expected[data._mask] = pd.NA
412419

413420
tm.assert_series_equal(result, expected)
414421

@@ -419,9 +426,8 @@ def _compare_other(self, data, op_name, other):
419426
expected = pd.Series(data._data)
420427
expected = op(expected, other)
421428
expected = expected.astype("boolean")
422-
423-
# fill the nan locations
424-
expected[data._mask] = op_name == "__ne__"
429+
# propagate NAs
430+
expected[data._mask] = pd.NA
425431

426432
tm.assert_series_equal(result, expected)
427433

@@ -438,6 +444,47 @@ def test_compare_array(self, data, all_compare_operators):
438444
other = pd.Series([True] * len(data))
439445
self._compare_other(data, op_name, other)
440446

447+
@pytest.mark.parametrize("other", [True, False, pd.NA])
448+
def test_scalar(self, other, all_compare_operators):
449+
op = self.get_op_from_name(all_compare_operators)
450+
a = pd.array([True, False, None], dtype="boolean")
451+
452+
result = op(a, other)
453+
454+
if other is pd.NA:
455+
expected = pd.array([None, None, None], dtype="boolean")
456+
else:
457+
values = op(a._data, other)
458+
expected = BooleanArray(values, a._mask, copy=True)
459+
tm.assert_extension_array_equal(result, expected)
460+
461+
# ensure we haven't mutated anything inplace
462+
result[0] = None
463+
tm.assert_extension_array_equal(
464+
a, pd.array([True, False, None], dtype="boolean")
465+
)
466+
467+
def test_array(self, all_compare_operators):
468+
op = self.get_op_from_name(all_compare_operators)
469+
a = pd.array([True] * 3 + [False] * 3 + [None] * 3, dtype="boolean")
470+
b = pd.array([True, False, None] * 3, dtype="boolean")
471+
472+
result = op(a, b)
473+
474+
values = op(a._data, b._data)
475+
mask = a._mask | b._mask
476+
expected = BooleanArray(values, mask)
477+
tm.assert_extension_array_equal(result, expected)
478+
479+
# ensure we haven't mutated anything inplace
480+
result[0] = None
481+
tm.assert_extension_array_equal(
482+
a, pd.array([True] * 3 + [False] * 3 + [None] * 3, dtype="boolean")
483+
)
484+
tm.assert_extension_array_equal(
485+
b, pd.array([True, False, None] * 3, dtype="boolean")
486+
)
487+
441488

442489
class TestArithmeticOps(BaseOpsUtil):
443490
def test_error(self, data, all_arithmetic_operators):

pandas/tests/extension/test_boolean.py

+11-3
Original file line number | Diff line number | Diff line change
@@ -60,13 +60,13 @@ def data_missing_for_sorting(dtype):
6060

6161
@pytest.fixture
6262
def na_cmp():
63-
# we are np.nan
64-
return lambda x, y: np.isnan(x) and np.isnan(y)
63+
# we are pd.NA
64+
return lambda x, y: x is pd.NA and y is pd.NA
6565

6666

6767
@pytest.fixture
6868
def na_value():
69-
return np.nan
69+
return pd.NA
7070

7171

7272
@pytest.fixture
@@ -160,6 +160,14 @@ def check_opname(self, s, op_name, other, exc=None):
160160
def _compare_other(self, s, data, op_name, other):
161161
self.check_opname(s, op_name, other)
162162

163+
@pytest.mark.skip(reason="Tested in tests/arrays/test_boolean.py")
164+
def test_compare_scalar(self, data, all_compare_operators):
165+
pass
166+
167+
@pytest.mark.skip(reason="Tested in tests/arrays/test_boolean.py")
168+
def test_compare_array(self, data, all_compare_operators):
169+
pass
170+
163171

164172
class TestReshaping(base.BaseReshapingTests):
165173
pass

pandas/tests/scalar/test_na_scalar.py

+2-2
Original file line number | Diff line number | Diff line change
@@ -48,15 +48,15 @@ def test_arithmetic_ops(all_arithmetic_functions):
4848

4949
def test_comparison_ops():
5050

51-
for other in [NA, 1, 1.0, "a", np.int64(1), np.nan]:
51+
for other in [NA, 1, 1.0, "a", np.int64(1), np.nan, np.bool_(True)]:
5252
assert (NA == other) is NA
5353
assert (NA != other) is NA
5454
assert (NA > other) is NA
5555
assert (NA >= other) is NA
5656
assert (NA < other) is NA
5757
assert (NA <= other) is NA
5858

59-
if isinstance(other, np.int64):
59+
if isinstance(other, (np.int64, np.bool_)):
6060
# for numpy scalars we get a deprecation warning and False as result
6161
# for equality or error for larger/lesser than
6262
continue

0 commit comments

Comments
 (0)