pandas-dev · jreback · Apr 13, 2021 · Apr 6, 2021 · Apr 6, 2021 · Apr 6, 2021
diff --git a/asv_bench/benchmarks/groupby.py b/asv_bench/benchmarks/groupby.py
@@ -480,7 +480,19 @@ class GroupByCythonAgg:
     param_names = ["dtype", "method"]
     params = [
         ["float64"],
-        ["sum", "prod", "min", "max", "mean", "median", "var", "first", "last"],
+        [
+            "sum",
+            "prod",
+            "min",
+            "max",
+            "mean",
+            "median",
+            "var",
+            "first",
+            "last",
+            "any",
+            "all",
+        ],
     ]
 
     def setup(self, dtype, method):

diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst
@@ -195,6 +195,9 @@ Other enhancements
 - :class:`RangeIndex` can now be constructed by passing a ``range`` object directly e.g. ``pd.RangeIndex(range(3))`` (:issue:`12067`)
 - :meth:`round` being enabled for the nullable integer and floating dtypes (:issue:`38844`)
 - :meth:`pandas.read_csv` and :meth:`pandas.read_json` expose the argument ``encoding_errors`` to control how encoding errors are handled (:issue:`39450`)
+- :class:`SeriesGroupBy` and :class:`DataFrameGroupBy` use Kleene logic for methods ``any`` and ``all`` with nullable data types (:issue:`37506`)
+- :class:`SeriesGroupBy` and :class:`DataFrameGroupBy` return a ``BooleanDtype`` for methods ``any`` and ``all`` with nullable data types (:issue:`33449`)
+-
 
 .. ---------------------------------------------------------------------------
 
@@ -754,6 +757,7 @@ Groupby/resample/rolling
 - Bug in :class:`core.window.ewm.ExponentialMovingWindow` when calling ``__getitem__`` would not retain ``com``, ``span``, ``alpha`` or ``halflife`` attributes  (:issue:`40164`)
 - :class:`core.window.ewm.ExponentialMovingWindow` now raises a ``NotImplementedError`` when specifying ``times`` with ``adjust=False`` due to an incorrect calculation (:issue:`40098`)
 - Bug in :meth:`Series.asfreq` and :meth:`DataFrame.asfreq` dropping rows when the index is not sorted (:issue:`39805`)
+- Bug in :class:`SeriesGroupBy` and :class:`DataFrameGroupBy` raising ``ValueError`` when using methods ``any`` and ``all`` with nullable type columns holding ``NA`` even with ``skipna=True`` (:issue:`40585`)
 
 Reshaping
 ^^^^^^^^^

diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx
@@ -393,9 +393,11 @@ def group_any_all(uint8_t[::1] out,
                   const intp_t[:] labels,
                   const uint8_t[::1] mask,
                   str val_test,
-                  bint skipna) -> None:
+                  bint skipna,
+                  bint masked) -> None:
     """
-    Aggregated boolean values to show truthfulness of group elements.
+    Aggregated boolean values to show truthfulness of group elements. If the
+    input is a nullable type, Kleene logic will be used.
 
     Parameters
     ----------
@@ -412,16 +414,20 @@ def group_any_all(uint8_t[::1] out,
         String object dictating whether to use any or all truth testing
     skipna : bool
         Flag to ignore nan values during truth testing
+    masked : bool
+        If True, compute the result using Kleene logic
 
     Notes
     -----
     This method modifies the `out` parameter rather than returning an object.
-    The returned values will either be 0 or 1 (False or True, respectively).
+    The returned values will either be 0, 1 (False or True, respectively), or
+    2 to signify a masked position in the case of a nullable input.
     """
     cdef:
         Py_ssize_t i, N = len(labels)
         intp_t lab
         uint8_t flag_val
+        bint use_kleene_logic = masked
 
     if val_test == 'all':
         # Because the 'all' value of an empty iterable in Python is True we can
@@ -444,6 +450,16 @@ def group_any_all(uint8_t[::1] out,
             if lab < 0 or (skipna and mask[i]):
                 continue
 
+            if use_kleene_logic and mask[i]:
+                # Set the position as masked if `out[lab] != flag_val`, which
+                # would indicate True/False has not yet been seen for any/all,
+                # so by Kleene logic the result is currently unknown
+                if out[lab] != flag_val:
+                    out[lab] = 2
+                continue
+
+            # If True and 'any' or False and 'all', the result is
+            # already determined
             if values[i] == flag_val:
                 out[lab] = flag_val
 

diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py
@@ -80,6 +80,8 @@ class providing the base-class of operations.
     Categorical,
     ExtensionArray,
 )
+from pandas.core.arrays.boolean import BooleanArray
+from pandas.core.arrays.masked import BaseMaskedArray
 from pandas.core.base import (
     DataError,
     PandasObject,
@@ -1413,16 +1415,25 @@ def _bool_agg(self, val_test, skipna):
         Shared func to call any / all Cython GroupBy implementations.
         """
 
-        def objs_to_bool(vals: np.ndarray) -> tuple[np.ndarray, type]:
+        def objs_to_bool(vals: ArrayLike) -> tuple[np.ndarray, type]:
             if is_object_dtype(vals):
                 vals = np.array([bool(x) for x in vals])
+            elif isinstance(vals, BaseMaskedArray):
+                vals = vals._data.astype(bool, copy=False)
             else:
                 vals = vals.astype(bool)
 
             return vals.view(np.uint8), bool
 
-        def result_to_bool(result: np.ndarray, inference: type) -> np.ndarray:
-            return result.astype(inference, copy=False)
+        def result_to_bool(
+            result: np.ndarray,
+            inference: type,
+            masked: bool = False,
+        ) -> ArrayLike:
+            if masked:
+                return BooleanArray(result.astype(bool, copy=False), result == 2)
+            else:
+                return result.astype(inference, copy=False)
 
         return self._get_cythonized_result(
             "group_any_all",
@@ -1435,6 +1446,7 @@ def result_to_bool(result: np.ndarray, inference: type) -> np.ndarray:
             post_processing=result_to_bool,
             val_test=val_test,
             skipna=skipna,
+            masked=False,
         )
 
     @final
@@ -2663,7 +2675,8 @@ def _get_cythonized_result(
             Function to be applied to result of Cython function. Should accept
             an array of values as the first argument and type inferences as its
             second argument, i.e. the signature should be
-            (ndarray, Type).
+            (ndarray, Type). Optionally, a third argument can be "masked", to
+            allow for processing specific to nullable values.
         **kwargs : dict
             Extra arguments to be passed back to Cython funcs
 
@@ -2689,10 +2702,16 @@ def _get_cythonized_result(
         output: dict[base.OutputKey, np.ndarray] = {}
         base_func = getattr(libgroupby, how)
 
+        post_processing_accepts_masked = post_processing is not None and (
+            "masked" in inspect.signature(post_processing).parameters
+        )
+        kwargs_accepts_masked = "masked" in kwargs
+
         error_msg = ""
         for idx, obj in enumerate(self._iterate_slices()):
             name = obj.name
             values = obj._values
+            is_nullable = isinstance(values, BaseMaskedArray)
 
             if numeric_only and not is_numeric_dtype(values):
                 continue
@@ -2738,6 +2757,9 @@ def _get_cythonized_result(
             if needs_ngroups:
                 func = partial(func, ngroups)
 
+            if kwargs_accepts_masked:
+                kwargs["masked"] = is_nullable
+
             func(**kwargs)  # Call func to modify indexer values in place
 
             if needs_2d:
@@ -2747,6 +2769,8 @@ def _get_cythonized_result(
                 result = algorithms.take_nd(values, result)
 
             if post_processing:
+                if post_processing_accepts_masked:
+                    post_processing = partial(post_processing, masked=is_nullable)
                 result = post_processing(result, inferences)
 
             key = base.OutputKey(label=name, position=idx)

diff --git a/pandas/tests/groupby/test_any_all.py b/pandas/tests/groupby/test_any_all.py
@@ -3,6 +3,7 @@
 import numpy as np
 import pytest
 
+import pandas as pd
 from pandas import (
     DataFrame,
     Index,
@@ -68,3 +69,83 @@ def test_bool_aggs_dup_column_labels(bool_agg_func):
 
     expected = df
     tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize("bool_agg_func", ["any", "all"])
+@pytest.mark.parametrize("skipna", [True, False])
+@pytest.mark.parametrize(
+    # expected is indexed as expected[skipna][bool_agg_func == "all"]
+    "data,expected",
+    [
+        ([False, False, False], [[False, False], [False, False]]),
+        ([True, True, True], [[True, True], [True, True]]),
+        ([pd.NA, pd.NA, pd.NA], [[pd.NA, pd.NA], [False, True]]),
+        ([False, pd.NA, False], [[pd.NA, False], [False, False]]),
+        ([True, pd.NA, True], [[True, pd.NA], [True, True]]),
+        ([True, pd.NA, False], [[True, False], [True, False]]),
+    ],
+)
+def test_masked_kleene_logic(bool_agg_func, data, expected, skipna):
+    # GH#37506
+    df = DataFrame(data, dtype="boolean")
+    expected = DataFrame(
+        [expected[skipna][bool_agg_func == "all"]], dtype="boolean", index=[1]
+    )
+
+    result = df.groupby([1, 1, 1]).agg(bool_agg_func, skipna=skipna)
+    tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "dtype1,dtype2,exp_col1,exp_col2",
+    [
+        (
+            "float",
+            "Float64",
+            pd.array([True], dtype=bool),
+            pd.array([pd.NA], dtype="boolean"),
+        ),
+        (
+            "Int64",
+            "float",
+            pd.array([pd.NA], dtype="boolean"),
+            pd.array([True], dtype=bool),
+        ),
+        (
+            "Int64",
+            "Int64",
+            pd.array([pd.NA], dtype="boolean"),
+            pd.array([pd.NA], dtype="boolean"),
+        ),
+        (
+            "Float64",
+            "boolean",
+            pd.array([pd.NA], dtype="boolean"),
+            pd.array([pd.NA], dtype="boolean"),
+        ),
+    ],
+)
+def test_masked_mixed_types(dtype1, dtype2, exp_col1, exp_col2):
+    data = [1.0, np.nan]
+    df = DataFrame(
+        {"col1": pd.array(data, dtype=dtype1), "col2": pd.array(data, dtype=dtype2)}
+    )
+    result = df.groupby([1, 1]).agg("all", skipna=False)
+
+    expected = DataFrame({"col1": exp_col1, "col2": exp_col2}, index=[1])
+    tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize("bool_agg_func", ["any", "all"])
+@pytest.mark.parametrize("dtype", ["Int64", "Float64", "boolean"])
+@pytest.mark.parametrize("skipna", [True, False])
+def test_masked_bool_aggs_skipna(bool_agg_func, dtype, skipna, frame_or_series):
+    # GH#40585
+    obj = frame_or_series([pd.NA, 1], dtype=dtype)
+    expected_res = True
+    if not skipna and bool_agg_func == "all":
+        expected_res = pd.NA
+    expected = frame_or_series([expected_res], index=[1], dtype="boolean")
+
+    result = obj.groupby([1, 1]).agg(bool_agg_func, skipna=skipna)
+    tm.assert_equal(result, expected)
diff --git a/pandas/tests/reductions/test_reductions.py b/pandas/tests/reductions/test_reductions.py
@@ -949,14 +949,20 @@ def test_all_any_boolean(self):
         assert s3.all(skipna=False)
         assert not s4.any(skipna=False)
 
-        # Check level TODO(GH-33449) result should also be boolean
-        s = Series(
+    @pytest.mark.parametrize(
+        "bool_agg_func,expected",
+        [("all", [False, True, False]), ("any", [False, True, True])],
+    )
+    def test_any_all_boolean_level(self, bool_agg_func, expected):
+        # GH#33449
+        ser = Series(
             [False, False, True, True, False, True],
             index=[0, 0, 1, 1, 2, 2],
             dtype="boolean",
         )
-        tm.assert_series_equal(s.all(level=0), Series([False, True, False]))
-        tm.assert_series_equal(s.any(level=0), Series([False, True, True]))
+        result = getattr(ser, bool_agg_func)(level=0)
+        expected = Series(expected, dtype="boolean")
+        tm.assert_series_equal(result, expected)
 
     def test_any_axis1_bool_only(self):
         # GH#32432