Skip to content

ENH/BUG: Use Kleene logic for groupby any/all #40819

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 40 commits into from
Apr 13, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
40 commits
Select commit Hold shift + click to select a range
088ca14
WIP
mzeitlin11 Apr 6, 2021
2554921
TSTS: Consolidate groupby any, all
mzeitlin11 Apr 6, 2021
9a8f9c9
Fixture fixup
mzeitlin11 Apr 6, 2021
5ca9c4b
Unmove test
mzeitlin11 Apr 6, 2021
68fd995
Merge remote-tracking branch 'origin/master' into enh/any_all_kleene
mzeitlin11 Apr 6, 2021
6530491
Merge branch 'tst/any_all' into enh/any_all_kleene
mzeitlin11 Apr 6, 2021
26146c2
Add initial tests
mzeitlin11 Apr 6, 2021
20f475d
Add whatsnew, bench
mzeitlin11 Apr 6, 2021
924b38e
Clean up edge case
mzeitlin11 Apr 6, 2021
423f43f
Fix typo
mzeitlin11 Apr 6, 2021
b1408ac
Avoid copy if possible
mzeitlin11 Apr 6, 2021
47ef037
Fix old level test
mzeitlin11 Apr 7, 2021
4415060
Precommit fixup
mzeitlin11 Apr 7, 2021
bb04c1c
Clean up print
mzeitlin11 Apr 7, 2021
9c90886
Merge remote-tracking branch 'origin/master' into enh/any_all_kleene
mzeitlin11 Apr 7, 2021
ef3fbe2
precommit fixup
mzeitlin11 Apr 7, 2021
f4c8a8a
Merge remote-tracking branch 'origin/master' into enh/any_all_kleene
mzeitlin11 Apr 8, 2021
1c3cb7d
Split out test
mzeitlin11 Apr 8, 2021
7cbf85b
Split whatsnew
mzeitlin11 Apr 8, 2021
809b8a4
whatsnew typo
mzeitlin11 Apr 8, 2021
58fd33a
Modify dispatch, add mixed test
mzeitlin11 Apr 8, 2021
80a65bb
Fix post proc check
mzeitlin11 Apr 8, 2021
c9b9d5f
Address review comments
mzeitlin11 Apr 8, 2021
7514568
Merge remote-tracking branch 'origin/master' into enh/any_all_kleene
mzeitlin11 Apr 9, 2021
740ad7b
Merge remote-tracking branch 'origin/master' into enh/any_all_kleene
mzeitlin11 Apr 10, 2021
a116bed
Name arguments better
mzeitlin11 Apr 10, 2021
8a428d4
Use -1 as mask signal
mzeitlin11 Apr 10, 2021
8e3c5be
Consistent typing
mzeitlin11 Apr 10, 2021
b627618
Don't use inspect
mzeitlin11 Apr 10, 2021
23b3b64
precommit fixup
mzeitlin11 Apr 10, 2021
a30496c
Clean up docstring
mzeitlin11 Apr 10, 2021
3051a99
Merge remote-tracking branch 'origin/master' into enh/any_all_kleene
mzeitlin11 Apr 12, 2021
4cd2833
Update doc/source/whatsnew/v1.3.0.rst
mzeitlin11 Apr 13, 2021
98cd401
Update doc/source/whatsnew/v1.3.0.rst
mzeitlin11 Apr 13, 2021
a92c637
Update doc/source/whatsnew/v1.3.0.rst
mzeitlin11 Apr 13, 2021
7c5c8e6
Update pandas/tests/groupby/test_any_all.py
mzeitlin11 Apr 13, 2021
c66d1fd
Update pandas/tests/groupby/test_any_all.py
mzeitlin11 Apr 13, 2021
0950234
Merge branch 'master' into enh/any_all_kleene
mzeitlin11 Apr 13, 2021
c81c1a5
Simplify teasts
mzeitlin11 Apr 13, 2021
d2b8ad0
Merge branch 'enh/any_all_kleene' of github.com:/mzeitlin11/pandas in…
mzeitlin11 Apr 13, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 13 additions & 1 deletion asv_bench/benchmarks/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -480,7 +480,19 @@ class GroupByCythonAgg:
param_names = ["dtype", "method"]
params = [
["float64"],
["sum", "prod", "min", "max", "mean", "median", "var", "first", "last"],
[
"sum",
"prod",
"min",
"max",
"mean",
"median",
"var",
"first",
"last",
"any",
"all",
],
]

def setup(self, dtype, method):
Expand Down
5 changes: 5 additions & 0 deletions doc/source/whatsnew/v1.3.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -217,6 +217,9 @@ Other enhancements
- :class:`RangeIndex` can now be constructed by passing a ``range`` object directly e.g. ``pd.RangeIndex(range(3))`` (:issue:`12067`)
- :meth:`round` being enabled for the nullable integer and floating dtypes (:issue:`38844`)
- :meth:`pandas.read_csv` and :meth:`pandas.read_json` expose the argument ``encoding_errors`` to control how encoding errors are handled (:issue:`39450`)
- :meth:`.GroupBy.any` and :meth:`.GroupBy.all` use Kleene logic with nullable data types (:issue:`37506`)
- :meth:`.GroupBy.any` and :meth:`.GroupBy.all` return a ``BooleanDtype`` for columns with nullable data types (:issue:`33449`)
-

.. ---------------------------------------------------------------------------

Expand Down Expand Up @@ -787,6 +790,8 @@ Groupby/resample/rolling
- Bug in :meth:`Series.asfreq` and :meth:`DataFrame.asfreq` dropping rows when the index is not sorted (:issue:`39805`)
- Bug in aggregation functions for :class:`DataFrame` not respecting ``numeric_only`` argument when ``level`` keyword was given (:issue:`40660`)
- Bug in :class:`core.window.RollingGroupby` where ``as_index=False`` argument in ``groupby`` was ignored (:issue:`39433`)
- Bug in :meth:`.GroupBy.any` and :meth:`.GroupBy.all` raising ``ValueError`` when using with nullable type columns holding ``NA`` even with ``skipna=True`` (:issue:`40585`)


Reshaping
^^^^^^^^^
Expand Down
33 changes: 25 additions & 8 deletions pandas/_libs/groupby.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -388,40 +388,47 @@ def group_fillna_indexer(ndarray[int64_t] out, ndarray[intp_t] labels,

@cython.boundscheck(False)
@cython.wraparound(False)
def group_any_all(uint8_t[::1] out,
const uint8_t[::1] values,
def group_any_all(int8_t[::1] out,
const int8_t[::1] values,
const intp_t[:] labels,
const uint8_t[::1] mask,
str val_test,
bint skipna) -> None:
bint skipna,
bint nullable) -> None:
"""
Aggregated boolean values to show truthfulness of group elements.
Aggregated boolean values to show truthfulness of group elements. If the
input is a nullable type (nullable=True), the result will be computed
using Kleene logic.

Parameters
----------
out : np.ndarray[np.uint8]
out : np.ndarray[np.int8]
Values into which this method will write its results.
labels : np.ndarray[np.intp]
Array containing unique label for each group, with its
ordering matching up to the corresponding record in `values`
values : np.ndarray[np.uint8]
values : np.ndarray[np.int8]
Containing the truth value of each element.
mask : np.ndarray[np.uint8]
Indicating whether a value is na or not.
val_test : {'any', 'all'}
String object dictating whether to use any or all truth testing
skipna : bool
Flag to ignore nan values during truth testing
nullable : bool
Whether or not the input is a nullable type. If True, the
result will be computed using Kleene logic

Notes
-----
This method modifies the `out` parameter rather than returning an object.
The returned values will either be 0 or 1 (False or True, respectively).
The returned values will either be 0, 1 (False or True, respectively), or
-1 to signify a masked position in the case of a nullable input.
"""
cdef:
Py_ssize_t i, N = len(labels)
intp_t lab
uint8_t flag_val
int8_t flag_val

if val_test == 'all':
# Because the 'all' value of an empty iterable in Python is True we can
Expand All @@ -444,6 +451,16 @@ def group_any_all(uint8_t[::1] out,
if lab < 0 or (skipna and mask[i]):
continue

if nullable and mask[i]:
# Set the position as masked if `out[lab] != flag_val`, which
# would indicate True/False has not yet been seen for any/all,
# so by Kleene logic the result is currently unknown
if out[lab] != flag_val:
out[lab] = -1
continue

# If True and 'any' or False and 'all', the result is
# already determined
if values[i] == flag_val:
out[lab] = flag_val

Expand Down
35 changes: 29 additions & 6 deletions pandas/core/groupby/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,8 @@ class providing the base-class of operations.
from pandas.core import nanops
import pandas.core.algorithms as algorithms
from pandas.core.arrays import (
BaseMaskedArray,
BooleanArray,
Categorical,
ExtensionArray,
)
Expand Down Expand Up @@ -1413,24 +1415,34 @@ def _bool_agg(self, val_test, skipna):
Shared func to call any / all Cython GroupBy implementations.
"""

def objs_to_bool(vals: np.ndarray) -> tuple[np.ndarray, type]:
def objs_to_bool(vals: ArrayLike) -> tuple[np.ndarray, type]:
if is_object_dtype(vals):
vals = np.array([bool(x) for x in vals])
elif isinstance(vals, BaseMaskedArray):
vals = vals._data.astype(bool, copy=False)
else:
vals = vals.astype(bool)

return vals.view(np.uint8), bool
return vals.view(np.int8), bool

def result_to_bool(result: np.ndarray, inference: type) -> np.ndarray:
return result.astype(inference, copy=False)
def result_to_bool(
result: np.ndarray,
inference: type,
nullable: bool = False,
) -> ArrayLike:
if nullable:
return BooleanArray(result.astype(bool, copy=False), result == -1)
else:
return result.astype(inference, copy=False)

return self._get_cythonized_result(
"group_any_all",
aggregate=True,
numeric_only=False,
cython_dtype=np.dtype(np.uint8),
cython_dtype=np.dtype(np.int8),
needs_values=True,
needs_mask=True,
needs_nullable=True,
pre_processing=objs_to_bool,
post_processing=result_to_bool,
val_test=val_test,
Expand Down Expand Up @@ -2613,6 +2625,7 @@ def _get_cythonized_result(
needs_counts: bool = False,
needs_values: bool = False,
needs_2d: bool = False,
needs_nullable: bool = False,
min_count: int | None = None,
needs_mask: bool = False,
needs_ngroups: bool = False,
Expand Down Expand Up @@ -2649,6 +2662,9 @@ def _get_cythonized_result(
signature
needs_ngroups : bool, default False
Whether number of groups is part of the Cython call signature
needs_nullable : bool, default False
Whether a bool specifying if the input is nullable is part
of the Cython call signature
result_is_index : bool, default False
Whether the result of the Cython operation is an index of
values to be retrieved, instead of the actual values themselves
Expand All @@ -2664,7 +2680,8 @@ def _get_cythonized_result(
Function to be applied to result of Cython function. Should accept
an array of values as the first argument and type inferences as its
second argument, i.e. the signature should be
(ndarray, Type).
(ndarray, Type). If `needs_nullable=True`, a third argument should be
`nullable`, to allow for processing specific to nullable values.
**kwargs : dict
Extra arguments to be passed back to Cython funcs

Expand Down Expand Up @@ -2739,6 +2756,12 @@ def _get_cythonized_result(
if needs_ngroups:
func = partial(func, ngroups)

if needs_nullable:
is_nullable = isinstance(values, BaseMaskedArray)
func = partial(func, nullable=is_nullable)
if post_processing:
post_processing = partial(post_processing, nullable=is_nullable)

func(**kwargs) # Call func to modify indexer values in place

if needs_2d:
Expand Down
84 changes: 84 additions & 0 deletions pandas/tests/groupby/test_any_all.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,11 @@
import numpy as np
import pytest

import pandas as pd
from pandas import (
DataFrame,
Index,
Series,
isna,
)
import pandas._testing as tm
Expand Down Expand Up @@ -68,3 +70,85 @@ def test_bool_aggs_dup_column_labels(bool_agg_func):

expected = df
tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("bool_agg_func", ["any", "all"])
@pytest.mark.parametrize("skipna", [True, False])
@pytest.mark.parametrize(
"data",
[
[False, False, False],
[True, True, True],
[pd.NA, pd.NA, pd.NA],
[False, pd.NA, False],
[True, pd.NA, True],
[True, pd.NA, False],
],
)
def test_masked_kleene_logic(bool_agg_func, skipna, data):
# GH#37506
ser = Series(data, dtype="boolean")

# The result should match aggregating on the whole series. Correctness
# there is verified in test_reductions.py::test_any_all_boolean_kleene_logic
expected_data = getattr(ser, bool_agg_func)(skipna=skipna)
expected = Series(expected_data, dtype="boolean")

result = ser.groupby([0, 0, 0]).agg(bool_agg_func, skipna=skipna)
tm.assert_series_equal(result, expected)


@pytest.mark.parametrize(
"dtype1,dtype2,exp_col1,exp_col2",
[
(
"float",
"Float64",
np.array([True], dtype=bool),
pd.array([pd.NA], dtype="boolean"),
),
(
"Int64",
"float",
pd.array([pd.NA], dtype="boolean"),
np.array([True], dtype=bool),
),
(
"Int64",
"Int64",
pd.array([pd.NA], dtype="boolean"),
pd.array([pd.NA], dtype="boolean"),
),
(
"Float64",
"boolean",
pd.array([pd.NA], dtype="boolean"),
pd.array([pd.NA], dtype="boolean"),
),
],
)
def test_masked_mixed_types(dtype1, dtype2, exp_col1, exp_col2):
# GH#37506
data = [1.0, np.nan]
df = DataFrame(
{"col1": pd.array(data, dtype=dtype1), "col2": pd.array(data, dtype=dtype2)}
)
result = df.groupby([1, 1]).agg("all", skipna=False)

expected = DataFrame({"col1": exp_col1, "col2": exp_col2}, index=[1])
tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("bool_agg_func", ["any", "all"])
@pytest.mark.parametrize("dtype", ["Int64", "Float64", "boolean"])
@pytest.mark.parametrize("skipna", [True, False])
def test_masked_bool_aggs_skipna(bool_agg_func, dtype, skipna, frame_or_series):
# GH#40585
obj = frame_or_series([pd.NA, 1], dtype=dtype)
expected_res = True
if not skipna and bool_agg_func == "all":
expected_res = pd.NA
expected = frame_or_series([expected_res], index=[1], dtype="boolean")

result = obj.groupby([1, 1]).agg(bool_agg_func, skipna=skipna)
tm.assert_equal(result, expected)
50 changes: 32 additions & 18 deletions pandas/tests/reductions/test_reductions.py
Original file line number Diff line number Diff line change
Expand Up @@ -941,31 +941,45 @@ def test_all_any_params(self):
with pytest.raises(NotImplementedError, match=msg):
s.all(bool_only=True)

def test_all_any_boolean(self):
# Check skipna, with boolean type
s1 = Series([pd.NA, True], dtype="boolean")
s2 = Series([pd.NA, False], dtype="boolean")
assert s1.all(skipna=False) is pd.NA # NA && True => NA
assert s1.all(skipna=True)
assert s2.any(skipna=False) is pd.NA # NA || False => NA
assert not s2.any(skipna=True)
@pytest.mark.parametrize("bool_agg_func", ["any", "all"])
@pytest.mark.parametrize("skipna", [True, False])
@pytest.mark.parametrize(
# expected_data indexed as [[skipna=False/any, skipna=False/all],
# [skipna=True/any, skipna=True/all]]
"data,expected_data",
[
([False, False, False], [[False, False], [False, False]]),
([True, True, True], [[True, True], [True, True]]),
([pd.NA, pd.NA, pd.NA], [[pd.NA, pd.NA], [False, True]]),
([False, pd.NA, False], [[pd.NA, False], [False, False]]),
([True, pd.NA, True], [[True, pd.NA], [True, True]]),
([True, pd.NA, False], [[True, False], [True, False]]),
],
)
def test_any_all_boolean_kleene_logic(
self, bool_agg_func, skipna, data, expected_data
):
ser = Series(data, dtype="boolean")
expected = expected_data[skipna][bool_agg_func == "all"]

# GH-33253: all True / all False values buggy with skipna=False
s3 = Series([True, True], dtype="boolean")
s4 = Series([False, False], dtype="boolean")
assert s3.all(skipna=False)
assert not s4.any(skipna=False)
result = getattr(ser, bool_agg_func)(skipna=skipna)
assert (result is pd.NA and expected is pd.NA) or result == expected

# Check level TODO(GH-33449) result should also be boolean
s = Series(
@pytest.mark.parametrize(
"bool_agg_func,expected",
[("all", [False, True, False]), ("any", [False, True, True])],
)
def test_any_all_boolean_level(self, bool_agg_func, expected):
# GH#33449
ser = Series(
[False, False, True, True, False, True],
index=[0, 0, 1, 1, 2, 2],
dtype="boolean",
)
with tm.assert_produces_warning(FutureWarning):
tm.assert_series_equal(s.all(level=0), Series([False, True, False]))
with tm.assert_produces_warning(FutureWarning):
tm.assert_series_equal(s.any(level=0), Series([False, True, True]))
result = getattr(ser, bool_agg_func)(level=0)
expected = Series(expected, dtype="boolean")
tm.assert_series_equal(result, expected)

def test_any_axis1_bool_only(self):
# GH#32432
Expand Down