Skip to content

ENH/BUG: Use Kleene logic for groupby any/all #40819

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 40 commits into from
Apr 13, 2021
Merged
Show file tree
Hide file tree
Changes from 22 commits
Commits
Show all changes
40 commits
Select commit Hold shift + click to select a range
088ca14
WIP
mzeitlin11 Apr 6, 2021
2554921
TSTS: Consolidate groupby any, all
mzeitlin11 Apr 6, 2021
9a8f9c9
Fixture fixup
mzeitlin11 Apr 6, 2021
5ca9c4b
Unmove test
mzeitlin11 Apr 6, 2021
68fd995
Merge remote-tracking branch 'origin/master' into enh/any_all_kleene
mzeitlin11 Apr 6, 2021
6530491
Merge branch 'tst/any_all' into enh/any_all_kleene
mzeitlin11 Apr 6, 2021
26146c2
Add initial tests
mzeitlin11 Apr 6, 2021
20f475d
Add whatsnew, bench
mzeitlin11 Apr 6, 2021
924b38e
Clean up edge case
mzeitlin11 Apr 6, 2021
423f43f
Fix typo
mzeitlin11 Apr 6, 2021
b1408ac
Avoid copy if possible
mzeitlin11 Apr 6, 2021
47ef037
Fix old level test
mzeitlin11 Apr 7, 2021
4415060
Precommit fixup
mzeitlin11 Apr 7, 2021
bb04c1c
Clean up print
mzeitlin11 Apr 7, 2021
9c90886
Merge remote-tracking branch 'origin/master' into enh/any_all_kleene
mzeitlin11 Apr 7, 2021
ef3fbe2
precommit fixup
mzeitlin11 Apr 7, 2021
f4c8a8a
Merge remote-tracking branch 'origin/master' into enh/any_all_kleene
mzeitlin11 Apr 8, 2021
1c3cb7d
Split out test
mzeitlin11 Apr 8, 2021
7cbf85b
Split whatsnew
mzeitlin11 Apr 8, 2021
809b8a4
whatsnew typo
mzeitlin11 Apr 8, 2021
58fd33a
Modify dispatch, add mixed test
mzeitlin11 Apr 8, 2021
80a65bb
Fix post proc check
mzeitlin11 Apr 8, 2021
c9b9d5f
Address review comments
mzeitlin11 Apr 8, 2021
7514568
Merge remote-tracking branch 'origin/master' into enh/any_all_kleene
mzeitlin11 Apr 9, 2021
740ad7b
Merge remote-tracking branch 'origin/master' into enh/any_all_kleene
mzeitlin11 Apr 10, 2021
a116bed
Name arguments better
mzeitlin11 Apr 10, 2021
8a428d4
Use -1 as mask signal
mzeitlin11 Apr 10, 2021
8e3c5be
Consistent typing
mzeitlin11 Apr 10, 2021
b627618
Don't use inspect
mzeitlin11 Apr 10, 2021
23b3b64
precommit fixup
mzeitlin11 Apr 10, 2021
a30496c
Clean up docstring
mzeitlin11 Apr 10, 2021
3051a99
Merge remote-tracking branch 'origin/master' into enh/any_all_kleene
mzeitlin11 Apr 12, 2021
4cd2833
Update doc/source/whatsnew/v1.3.0.rst
mzeitlin11 Apr 13, 2021
98cd401
Update doc/source/whatsnew/v1.3.0.rst
mzeitlin11 Apr 13, 2021
a92c637
Update doc/source/whatsnew/v1.3.0.rst
mzeitlin11 Apr 13, 2021
7c5c8e6
Update pandas/tests/groupby/test_any_all.py
mzeitlin11 Apr 13, 2021
c66d1fd
Update pandas/tests/groupby/test_any_all.py
mzeitlin11 Apr 13, 2021
0950234
Merge branch 'master' into enh/any_all_kleene
mzeitlin11 Apr 13, 2021
c81c1a5
Simplify teasts
mzeitlin11 Apr 13, 2021
d2b8ad0
Merge branch 'enh/any_all_kleene' of github.com:/mzeitlin11/pandas in…
mzeitlin11 Apr 13, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 13 additions & 1 deletion asv_bench/benchmarks/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -480,7 +480,19 @@ class GroupByCythonAgg:
param_names = ["dtype", "method"]
params = [
["float64"],
["sum", "prod", "min", "max", "mean", "median", "var", "first", "last"],
[
"sum",
"prod",
"min",
"max",
"mean",
"median",
"var",
"first",
"last",
"any",
"all",
],
]

def setup(self, dtype, method):
Expand Down
4 changes: 4 additions & 0 deletions doc/source/whatsnew/v1.3.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -195,6 +195,9 @@ Other enhancements
- :class:`RangeIndex` can now be constructed by passing a ``range`` object directly e.g. ``pd.RangeIndex(range(3))`` (:issue:`12067`)
- :meth:`round` being enabled for the nullable integer and floating dtypes (:issue:`38844`)
- :meth:`pandas.read_csv` and :meth:`pandas.read_json` expose the argument ``encoding_errors`` to control how encoding errors are handled (:issue:`39450`)
- :class:`SeriesGroupBy` and :class:`DataFrameGroupBy` use Kleene logic for methods ``any`` and ``all`` with nullable data types (:issue:`37506`)
- :class:`SeriesGroupBy` and :class:`DataFrameGroupBy` return a ``BooleanDType`` for methods ``any`` and ``all`` with nullable data types (:issue:`33449`)
-

.. ---------------------------------------------------------------------------

Expand Down Expand Up @@ -754,6 +757,7 @@ Groupby/resample/rolling
- Bug in :class:`core.window.ewm.ExponentialMovingWindow` when calling ``__getitem__`` would not retain ``com``, ``span``, ``alpha`` or ``halflife`` attributes (:issue:`40164`)
- :class:`core.window.ewm.ExponentialMovingWindow` now raises a ``NotImplementedError`` when specifying ``times`` with ``adjust=False`` due to an incorrect calculation (:issue:`40098`)
- Bug in :meth:`Series.asfreq` and :meth:`DataFrame.asfreq` dropping rows when the index is not sorted (:issue:`39805`)
- Bug in :class:`SeriesGroupBy` and :class:`DataFrameGroupBy` raising ``ValueError`` when using methods ``any`` and ``all`` with nullable type columns holding ``NA`` even with ``skipna=True`` (:issue:`40585`)

Reshaping
^^^^^^^^^
Expand Down
22 changes: 19 additions & 3 deletions pandas/_libs/groupby.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -393,9 +393,11 @@ def group_any_all(uint8_t[::1] out,
const intp_t[:] labels,
const uint8_t[::1] mask,
str val_test,
bint skipna) -> None:
bint skipna,
bint masked) -> None:
"""
Aggregated boolean values to show truthfulness of group elements.
Aggregated boolean values to show truthfulness of group elements. If the
input is a nullable type, Kleene logic will be used.

Parameters
----------
Expand All @@ -412,16 +414,20 @@ def group_any_all(uint8_t[::1] out,
String object dictating whether to use any or all truth testing
skipna : bool
Flag to ignore nan values during truth testing
masked : bool
If True, compute the result using Kleene logic

Notes
-----
This method modifies the `out` parameter rather than returning an object.
The returned values will either be 0 or 1 (False or True, respectively).
The returned values will either be 0, 1 (False or True, respectively), or
2 to signify a masked position in the case of a nullable input.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

hmm is this what we do elsewhere to encode Kleene logic? (e.g. would rather use -1 on nulls)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The other algo is a bit different since it can be vectorized and computes both data and mask (so doesn't need to store something signifying NULL). Agree that -1 is more intuitive, have changed to use -1 as the mask signal

Copy link
Member

@jorisvandenbossche jorisvandenbossche Apr 10, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

At some point, it might be interesting to look into directly filling in the mask as well instead of using the temporary -1 (but not for this PR to be clear ;)).

Personally, I would maybe leave it at using 2, so you don't need to change the uint8 type

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yep I don't think that would be too bad, that's how I had approached it originally, but changes were slightly more invasive, so stuck with this.

For the -1 vs 2 question, I think I slightly prefer -1 because it seems more immediately apparent what it represents. Although it forces changing the dtype, still the same number of bytes, so memory layout is unaffected and constructing the view looks equivalent speed-wise:

In [1]: import numpy as np

In [2]: arr = np.array([True, False, True])

In [3]: %timeit arr.view('uint8')
332 ns ± 2.84 ns per loop (mean ± std. dev. of 7 runs, 1000000 loops each)

In [4]: %timeit arr.view('int8')
334 ns ± 3.78 ns per loop (mean ± std. dev. of 7 runs, 1000000 loops each)

But not a strong opinion, fine with using 2 as well :)

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If speed-wise the int8 vs uint8 doesn't matter, then indeed the -1 is fine as well

"""
cdef:
Py_ssize_t i, N = len(labels)
intp_t lab
uint8_t flag_val
bint use_kleene_logic = masked

if val_test == 'all':
# Because the 'all' value of an empty iterable in Python is True we can
Expand All @@ -444,6 +450,16 @@ def group_any_all(uint8_t[::1] out,
if lab < 0 or (skipna and mask[i]):
continue

if use_kleene_logic and mask[i]:
# Set the position as masked if `out[lab] != flag_val`, which
# would indicate True/False has not yet been seen for any/all,
# so by Kleene logic the result is currently unknown
if out[lab] != flag_val:
out[lab] = 2
continue

# If True and 'any' or False and 'all', the result is
# already determined
if values[i] == flag_val:
out[lab] = flag_val

Expand Down
32 changes: 28 additions & 4 deletions pandas/core/groupby/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,8 @@ class providing the base-class of operations.
Categorical,
ExtensionArray,
)
from pandas.core.arrays.boolean import BooleanArray
from pandas.core.arrays.masked import BaseMaskedArray
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You can import both from from pandas.core.arrays import ..

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

thanks!

from pandas.core.base import (
DataError,
PandasObject,
Expand Down Expand Up @@ -1413,16 +1415,25 @@ def _bool_agg(self, val_test, skipna):
Shared func to call any / all Cython GroupBy implementations.
"""

def objs_to_bool(vals: np.ndarray) -> tuple[np.ndarray, type]:
def objs_to_bool(vals: ArrayLike) -> tuple[np.ndarray, type]:
if is_object_dtype(vals):
vals = np.array([bool(x) for x in vals])
elif isinstance(vals, BaseMaskedArray):
vals = vals._data.astype(bool, copy=False)
else:
vals = vals.astype(bool)

return vals.view(np.uint8), bool

def result_to_bool(result: np.ndarray, inference: type) -> np.ndarray:
return result.astype(inference, copy=False)
def result_to_bool(
result: np.ndarray,
inference: type,
masked: bool = False,
) -> ArrayLike:
if masked:
return BooleanArray(result.astype(bool, copy=False), result == 2)
else:
return result.astype(inference, copy=False)

return self._get_cythonized_result(
"group_any_all",
Expand All @@ -1435,6 +1446,7 @@ def result_to_bool(result: np.ndarray, inference: type) -> np.ndarray:
post_processing=result_to_bool,
val_test=val_test,
skipna=skipna,
masked=False,
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why is this set to always be False?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Had originally used masked being included in kwargs as a signal for whether to pass masked into the groupby func. But the original value here wasn't relevant since it gets set before being used.

This is definitely confusing and a poor solution, changed to instead use a positional arg

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks, the update looks clearer to follow now

)

@final
Expand Down Expand Up @@ -2663,7 +2675,8 @@ def _get_cythonized_result(
Function to be applied to result of Cython function. Should accept
an array of values as the first argument and type inferences as its
second argument, i.e. the signature should be
(ndarray, Type).
(ndarray, Type). Optionally, a third argument can be "masked", to
allow for processing specific to nullable values
**kwargs : dict
Extra arguments to be passed back to Cython funcs

Expand All @@ -2689,10 +2702,16 @@ def _get_cythonized_result(
output: dict[base.OutputKey, np.ndarray] = {}
base_func = getattr(libgroupby, how)

post_processing_accepts_masked = post_processing is not None and (
"masked" in inspect.signature(post_processing).parameters
)
kwargs_accepts_masked = "masked" in kwargs

error_msg = ""
for idx, obj in enumerate(self._iterate_slices()):
name = obj.name
values = obj._values
is_nullable = isinstance(values, BaseMaskedArray)

if numeric_only and not is_numeric_dtype(values):
continue
Expand Down Expand Up @@ -2738,6 +2757,9 @@ def _get_cythonized_result(
if needs_ngroups:
func = partial(func, ngroups)

if kwargs_accepts_masked:
kwargs["masked"] = is_nullable

func(**kwargs) # Call func to modify indexer values in place

if needs_2d:
Expand All @@ -2747,6 +2769,8 @@ def _get_cythonized_result(
result = algorithms.take_nd(values, result)

if post_processing:
if post_processing_accepts_masked:
post_processing = partial(post_processing, masked=is_nullable)
result = post_processing(result, inferences)

key = base.OutputKey(label=name, position=idx)
Expand Down
81 changes: 81 additions & 0 deletions pandas/tests/groupby/test_any_all.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import numpy as np
import pytest

import pandas as pd
from pandas import (
DataFrame,
Index,
Expand Down Expand Up @@ -68,3 +69,83 @@ def test_bool_aggs_dup_column_labels(bool_agg_func):

expected = df
tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("bool_agg_func", ["any", "all"])
@pytest.mark.parametrize("skipna", [True, False])
@pytest.mark.parametrize(
# expected indexed as [skipna][bool_agg_func == "all"]
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you maybe also write out what that results in practice? So [skipna=False/any, skipna=False / all], [skipna=True / any, skipna=True / all], if I understand correctly

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yep have changed, that is a much clearer way to describe it

"data,expected",
[
([False, False, False], [[False, False], [False, False]]),
([True, True, True], [[True, True], [True, True]]),
([pd.NA, pd.NA, pd.NA], [[pd.NA, pd.NA], [False, True]]),
([False, pd.NA, False], [[pd.NA, False], [False, False]]),
([True, pd.NA, True], [[True, pd.NA], [True, True]]),
([True, pd.NA, False], [[True, False], [True, False]]),
],
)
def test_masked_kleene_logic(bool_agg_func, data, expected, skipna):
# GH#37506
df = DataFrame(data, dtype="boolean")
expected = DataFrame(
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this expected is really hard to parse can you do in multiple steps / make simpler

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Addressing your comment above makes this simpler, let me know if you think any piece is still confusing

[expected[skipna][bool_agg_func == "all"]], dtype="boolean", index=[1]
)

result = df.groupby([1, 1, 1]).agg(bool_agg_func, skipna=skipna)
tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize(
"dtype1,dtype2,exp_col1,exp_col2",
[
(
"float",
"Float64",
pd.array([True], dtype=bool),
pd.array([pd.NA], dtype="boolean"),
),
(
"Int64",
"float",
pd.array([pd.NA], dtype="boolean"),
pd.array([True], dtype=bool),
),
(
"Int64",
"Int64",
pd.array([pd.NA], dtype="boolean"),
pd.array([pd.NA], dtype="boolean"),
),
(
"Float64",
"boolean",
pd.array([pd.NA], dtype="boolean"),
pd.array([pd.NA], dtype="boolean"),
),
],
)
def test_masked_mixed_types(dtype1, dtype2, exp_col1, exp_col2):
data = [1.0, np.nan]
df = DataFrame(
{"col1": pd.array(data, dtype=dtype1), "col2": pd.array(data, dtype=dtype2)}
)
result = df.groupby([1, 1]).agg("all", skipna=False)

expected = DataFrame({"col1": exp_col1, "col2": exp_col2}, index=[1])
tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("bool_agg_func", ["any", "all"])
@pytest.mark.parametrize("dtype", ["Int64", "Float64", "boolean"])
@pytest.mark.parametrize("skipna", [True, False])
def test_masked_bool_aggs_skipna(bool_agg_func, dtype, skipna, frame_or_series):
# GH#40585
obj = frame_or_series([pd.NA, 1], dtype=dtype)
expected_res = True
if not skipna and bool_agg_func == "all":
expected_res = pd.NA
expected = frame_or_series([expected_res], index=[1], dtype="boolean")

result = obj.groupby([1, 1]).agg(bool_agg_func, skipna=skipna)
tm.assert_equal(result, expected)
14 changes: 10 additions & 4 deletions pandas/tests/reductions/test_reductions.py
Original file line number Diff line number Diff line change
Expand Up @@ -949,14 +949,20 @@ def test_all_any_boolean(self):
assert s3.all(skipna=False)
assert not s4.any(skipna=False)

# Check level TODO(GH-33449) result should also be boolean
s = Series(
@pytest.mark.parametrize(
"bool_agg_func,expected",
[("all", [False, True, False]), ("any", [False, True, True])],
)
def test_any_all_boolean_level(self, bool_agg_func, expected):
# GH#33449
ser = Series(
[False, False, True, True, False, True],
index=[0, 0, 1, 1, 2, 2],
dtype="boolean",
)
tm.assert_series_equal(s.all(level=0), Series([False, True, False]))
tm.assert_series_equal(s.any(level=0), Series([False, True, True]))
result = getattr(ser, bool_agg_func)(level=0)
expected = Series(expected, dtype="boolean")
tm.assert_series_equal(result, expected)

def test_any_axis1_bool_only(self):
# GH#32432
Expand Down