diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index 5e95cd6e5ee10..40ae4099532d0 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -753,6 +753,8 @@ Groupby/resample/rolling - Bug in :class:`core.window.ewm.ExponentialMovingWindow` when calling ``__getitem__`` would not retain ``com``, ``span``, ``alpha`` or ``halflife`` attributes (:issue:`40164`) - :class:`core.window.ewm.ExponentialMovingWindow` now raises a ``NotImplementedError`` when specifying ``times`` with ``adjust=False`` due to an incorrect calculation (:issue:`40098`) - Bug in :meth:`Series.asfreq` and :meth:`DataFrame.asfreq` dropping rows when the index is not sorted (:issue:`39805`) +- Bug in :class:`SeriesGroupBy` and :class:`DataFrameGroupBy` raising ``ValueError`` when using methods ``any`` and ``all`` with ``ExtensionDtype`` columns holding ``NA`` even with ``skipna=True`` (:issue:`40585`) +- Reshaping ^^^^^^^^^ diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index a6c3cb3ff5d0b..bbb35c83fc6fa 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -1414,14 +1414,17 @@ def _obj_1d_constructor(self) -> Type[Series]: return self.obj._constructor @final - def _bool_agg(self, val_test, skipna): + def _bool_agg(self, val_test, skipna: bool): """ Shared func to call any / all Cython GroupBy implementations. 
""" - def objs_to_bool(vals: np.ndarray) -> Tuple[np.ndarray, Type]: + def objs_to_bool(vals: ArrayLike) -> Tuple[np.ndarray, Type]: if is_object_dtype(vals): vals = np.array([bool(x) for x in vals]) + elif isinstance(vals, ExtensionArray): + vals = vals.to_numpy(dtype=bool, na_value=np.nan) + vals = vals.astype(bool) else: vals = vals.astype(bool) diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py index 515774eae009b..e16d2e0f2dd0a 100644 --- a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -76,6 +76,35 @@ def test_groupby_bool_aggs(agg_func, skipna, vals): tm.assert_frame_equal(result, exp_df) +@pytest.mark.parametrize("bool_agg_func", ["any", "all"]) +@pytest.mark.parametrize("skipna", [True, False]) +def test_bool_aggs_dup_column_labels(bool_agg_func, skipna): + # 21668 + df = DataFrame([[True, True]], columns=["a", "a"]) + grp_by = df.groupby([0]) + result = getattr(grp_by, bool_agg_func)(skipna=skipna) + + expected = df + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("bool_agg_func", ["any", "all"]) +@pytest.mark.parametrize("dtype", ["Int64", "Float64", "boolean"]) +@pytest.mark.parametrize("group_by_frame", [True, False]) +def test_bool_aggs_ea_skipna(bool_agg_func, dtype, group_by_frame): + # GH-40585 + df = DataFrame({"grp": [1, 1], "val": pd.array([pd.NA, 1], dtype=dtype)}) + if group_by_frame: + grouped = df.groupby("grp") + expected = DataFrame({"val": [True]}, index=Index([1], name="grp")) + else: + grouped = df["val"].groupby(df["grp"]) + expected = Series([True], index=Index([1], name="grp"), name="val") + + result = grouped.agg(bool_agg_func, skipna=True) + tm.assert_equal(result, expected) + + def test_max_min_non_numeric(): # #2700 aa = DataFrame({"nn": [11, 11, 22, 22], "ii": [1, 2, 3, 4], "ss": 4 * ["mama"]}) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index de508b8cd78ec..6c51e32fa9a78 100644 --- 
a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -1978,17 +1978,6 @@ def test_groupby_duplicate_index(): tm.assert_series_equal(result, expected) -@pytest.mark.parametrize("bool_agg_func", ["any", "all"]) -def test_bool_aggs_dup_column_labels(bool_agg_func): - # 21668 - df = DataFrame([[True, True]], columns=["a", "a"]) - grp_by = df.groupby([0]) - result = getattr(grp_by, bool_agg_func)() - - expected = df - tm.assert_frame_equal(result, expected) - - @pytest.mark.parametrize( "idx", [Index(["a", "a"]), MultiIndex.from_tuples((("a", "a"), ("a", "a")))] )