Skip to content

API: BooleanArray any/all with NA logic #30062

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Merged
Show file tree
Hide file tree
Changes from 7 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions ci/code_checks.sh
Original file line number Diff line number Diff line change
Expand Up @@ -281,6 +281,10 @@ if [[ -z "$CHECK" || "$CHECK" == "doctests" ]]; then
pytest -q --doctest-modules pandas/core/arrays/string_.py
RET=$(($RET + $?)) ; echo $MSG "DONE"

MSG='Doctests arrays/boolean.py' ; echo $MSG
pytest -q --doctest-modules pandas/core/arrays/boolean.py
RET=$(($RET + $?)) ; echo $MSG "DONE"

fi

### DOCSTRINGS ###
Expand Down
148 changes: 143 additions & 5 deletions pandas/core/arrays/boolean.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@

from pandas._libs import lib, missing as libmissing
from pandas.compat import set_function_name
from pandas.compat.numpy import function as nv

from pandas.core.dtypes.base import ExtensionDtype
from pandas.core.dtypes.cast import astype_nansafe
Expand Down Expand Up @@ -560,6 +561,143 @@ def _values_for_argsort(self) -> np.ndarray:
data[self._mask] = -1
return data

def any(self, skipna: bool = True, **kwargs):
    """
    Return whether any element is True.

    Returns False unless there is at least one element that is True.
    By default, NAs are skipped. If ``skipna=False`` is specified and
    missing values are present, similar :ref:`Kleene logic <boolean.kleene>`
    is used as for logical operations.

    Parameters
    ----------
    skipna : bool, default True
        Exclude NA values. If the entire array is NA and `skipna` is
        True, then the result will be False, as for an empty array.
        If `skipna` is False, the result will still be True if there is
        at least one element that is True, otherwise NA will be returned
        if there are NA's present.
    **kwargs : any, default None
        Additional keywords have no effect but might be accepted for
        compatibility with NumPy.

    Returns
    -------
    bool or :attr:`pandas.NA`

    See Also
    --------
    numpy.any : Numpy version of this method.
    BooleanArray.all : Return whether all elements are True.

    Examples
    --------

    The result indicates whether any element is True (and by default
    skips NAs):

    >>> pd.array([True, False, True]).any()
    True
    >>> pd.array([True, False, pd.NA]).any()
    True
    >>> pd.array([False, False, pd.NA]).any()
    False
    >>> pd.array([], dtype="boolean").any()
    False
    >>> pd.array([pd.NA], dtype="boolean").any()
    False

    With ``skipna=False``, the result can be NA if this is logically
    required (whether ``pd.NA`` is True or False influences the result):

    >>> pd.array([True, False, pd.NA]).any(skipna=False)
    True
    >>> pd.array([False, False, pd.NA]).any(skipna=False)
    NA

    Without missing values, ``skipna=False`` gives a plain boolean:

    >>> pd.array([False, False], dtype="boolean").any(skipna=False)
    False
    """
    # ``axis`` is discarded up front; whatever keywords remain are handed to
    # the NumPy-compat validator.
    kwargs.pop("axis", None)
    nv.validate_any((), kwargs)

    # Work on a copy with masked slots set to False so that missing values
    # can never be the reason the reduction comes out True.
    values = self._data.copy()
    np.putmask(values, self._mask, False)
    result = values.any()

    if skipna:
        return result

    # Kleene logic: a True settles the answer regardless of any NA. Without
    # a True, the answer is only unknown when a missing value is actually
    # present; an array with no NA at all must yield a definite False.
    # Emptiness is tested with ``len(self) == 0`` rather than truthiness,
    # because the container-truthiness idiom does not apply to arrays.
    if result or len(self) == 0 or not self._mask.any():
        return result
    return self.dtype.na_value

def all(self, skipna: bool = True, **kwargs):
    """
    Return whether all elements are True.

    Returns True unless there is at least one element that is False.
    By default, NAs are skipped. If ``skipna=False`` is specified and
    missing values are present, similar :ref:`Kleene logic <boolean.kleene>`
    is used as for logical operations.

    Parameters
    ----------
    skipna : bool, default True
        Exclude NA values. If the entire array is NA and `skipna` is
        True, then the result will be True, as for an empty array.
        If `skipna` is False, the result will still be False if there is
        at least one element that is False, otherwise NA will be returned
        if there are NA's present.
    **kwargs : any, default None
        Additional keywords have no effect but might be accepted for
        compatibility with NumPy.

    Returns
    -------
    bool or :attr:`pandas.NA`

    See Also
    --------
    numpy.all : Numpy version of this method.
    BooleanArray.any : Return whether any element is True.

    Examples
    --------

    The result indicates whether all elements are True (and by default
    skips NAs):

    >>> pd.array([True, True, pd.NA]).all()
    True
    >>> pd.array([True, False, pd.NA]).all()
    False
    >>> pd.array([], dtype="boolean").all()
    True
    >>> pd.array([pd.NA], dtype="boolean").all()
    True

    With ``skipna=False``, the result can be NA if this is logically
    required (whether ``pd.NA`` is True or False influences the result):

    >>> pd.array([True, True, pd.NA]).all(skipna=False)
    NA
    >>> pd.array([True, False, pd.NA]).all(skipna=False)
    False

    Without missing values, ``skipna=False`` gives a plain boolean:

    >>> pd.array([True, True], dtype="boolean").all(skipna=False)
    True
    """
    # ``axis`` is discarded up front; whatever keywords remain are handed to
    # the NumPy-compat validator.
    kwargs.pop("axis", None)
    nv.validate_all((), kwargs)

    # Work on a copy with masked slots set to True so that missing values
    # can never be the reason the reduction comes out False.
    values = self._data.copy()
    np.putmask(values, self._mask, True)
    result = values.all()

    if skipna:
        return result

    # Kleene logic: a False settles the answer regardless of any NA. Without
    # a False, the answer is only unknown when a missing value is actually
    # present; an array with no NA at all must yield a definite True.
    # Emptiness is tested with ``len(self) == 0`` rather than truthiness,
    # because the container-truthiness idiom does not apply to arrays.
    if not result or len(self) == 0 or not self._mask.any():
        return result
    return self.dtype.na_value

@classmethod
def _create_logical_method(cls, op):
def logical_method(self, other):
Expand Down Expand Up @@ -656,6 +794,10 @@ def cmp_method(self, other):
return set_function_name(cmp_method, name, cls)

def _reduce(self, name, skipna=True, **kwargs):

if name in {"any", "all"}:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

we usually use lists for these checks

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In this file we actually use more in {} than in [] (both are used), but since Tom and I wrote this file, that's probably not an argument ;)
Happy to change it; purely performance-wise the set is faster (but this is about nanoseconds, of course...)

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Heh, I'm probably to blame for the sets :) I like them more for membership tests, though it doesn't matter for small sets.

return getattr(self, name)(skipna=skipna, **kwargs)

data = self._data
mask = self._mask

Expand All @@ -667,12 +809,8 @@ def _reduce(self, name, skipna=True, **kwargs):
op = getattr(nanops, "nan" + name)
result = op(data, axis=0, skipna=skipna, mask=mask, **kwargs)

# if we have a boolean op, don't coerce
if name in ["any", "all"]:
pass

# if we have numeric op that would result in an int, coerce to int if possible
elif name in ["sum", "prod"] and notna(result):
if name in ["sum", "prod"] and notna(result):
int_result = np.int64(result)
if int_result == result:
result = int_result
Expand Down
27 changes: 27 additions & 0 deletions pandas/tests/arrays/test_boolean.py
Original file line number Diff line number Diff line change
Expand Up @@ -700,6 +700,33 @@ def test_reductions_return_types(dropna, data, all_numeric_reductions):
assert isinstance(getattr(s, op)(), np.float64)


@pytest.mark.parametrize(
    "values, exp_any, exp_all, exp_any_noskip, exp_all_noskip",
    [
        ([True, pd.NA], True, True, True, pd.NA),
        ([False, pd.NA], False, False, pd.NA, False),
        ([pd.NA], False, True, pd.NA, pd.NA),
        ([], False, True, False, True),
        # No missing values at all: skipna=False has nothing unknown to
        # report, so the result must be a plain boolean and never NA.
        ([True, True], True, True, True, True),
        ([False, False], False, False, False, False),
        ([True, False], True, False, True, False),
    ],
)
def test_any_all(values, exp_any, exp_all, exp_any_noskip, exp_all_noskip):
    """Check any/all on boolean data for both skipna settings.

    Each case is run through ``pd.array`` and ``pd.Series`` and compared by
    identity against the expected scalar.
    """

    def as_expected(value):
        # the methods return numpy scalars; NA is kept as the singleton
        return pd.NA if value is pd.NA else np.bool_(value)

    exp_any = as_expected(exp_any)
    exp_all = as_expected(exp_all)
    exp_any_noskip = as_expected(exp_any_noskip)
    exp_all_noskip = as_expected(exp_all_noskip)

    for con in [pd.array, pd.Series]:
        a = con(values, dtype="boolean")
        assert a.any() is exp_any
        assert a.all() is exp_all
        assert a.any(skipna=False) is exp_any_noskip
        assert a.all(skipna=False) is exp_all_noskip

        # the default-skipna results are also passed through np.any / np.all
        assert np.any(a.any()) is exp_any
        assert np.all(a.all()) is exp_all


# TODO when BooleanArray coerces to object dtype numpy array, need to do conversion
# manually in the indexing code
# def test_indexing_boolean_mask():
Expand Down