diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index 8d9e1e9958d23..3774ed2f32c13 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -268,6 +268,7 @@ Other enhancements - :meth:`read_csv` and :meth:`read_json` expose the argument ``encoding_errors`` to control how encoding errors are handled (:issue:`39450`) - :meth:`.GroupBy.any` and :meth:`.GroupBy.all` use Kleene logic with nullable data types (:issue:`37506`) - :meth:`.GroupBy.any` and :meth:`.GroupBy.all` return a ``BooleanDtype`` for columns with nullable data types (:issue:`33449`) +- :meth:`.GroupBy.any` and :meth:`.GroupBy.all` raising with ``object`` data containing ``pd.NA`` even when ``skipna=True`` (:issue:`37501`) - :meth:`.GroupBy.rank` now supports object-dtype data (:issue:`38278`) - Constructing a :class:`DataFrame` or :class:`Series` with the ``data`` argument being a Python iterable that is *not* a NumPy ``ndarray`` consisting of NumPy scalars will now result in a dtype with a precision the maximum of the NumPy scalars; this was already the case when ``data`` is a NumPy ``ndarray`` (:issue:`40908`) - Add keyword ``sort`` to :func:`pivot_table` to allow non-sorting of the result (:issue:`39143`) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index f694dcce809ea..0080791a51a4b 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -1519,7 +1519,11 @@ def _bool_agg(self, val_test, skipna): def objs_to_bool(vals: ArrayLike) -> tuple[np.ndarray, type]: if is_object_dtype(vals): - vals = np.array([bool(x) for x in vals]) + # GH#37501: don't raise on pd.NA when skipna=True + if skipna: + vals = np.array([bool(x) if not isna(x) else True for x in vals]) + else: + vals = np.array([bool(x) for x in vals]) elif isinstance(vals, BaseMaskedArray): vals = vals._data.astype(bool, copy=False) else: diff --git a/pandas/tests/groupby/test_any_all.py b/pandas/tests/groupby/test_any_all.py index d4f80aa0e51d4..13232d454a48c 100644 --- a/pandas/tests/groupby/test_any_all.py +++ b/pandas/tests/groupby/test_any_all.py @@ -152,3 +152,29 @@ def test_masked_bool_aggs_skipna(bool_agg_func, dtype, skipna, frame_or_series): result = obj.groupby([1, 1]).agg(bool_agg_func, skipna=skipna) tm.assert_equal(result, expected) + + +@pytest.mark.parametrize( + "bool_agg_func,data,expected_res", + [ + ("any", [pd.NA, np.nan], False), + ("any", [pd.NA, 1, np.nan], True), + ("all", [pd.NA, pd.NaT], True), + ("all", [pd.NA, False, pd.NaT], False), + ], +) +def test_object_type_missing_vals(bool_agg_func, data, expected_res, frame_or_series): + # GH#37501 + obj = frame_or_series(data, dtype=object) + result = obj.groupby([1] * len(data)).agg(bool_agg_func) + expected = frame_or_series([expected_res], index=[1], dtype="bool") + tm.assert_equal(result, expected) + + +@pytest.mark.filterwarnings("ignore:Dropping invalid columns:FutureWarning") +@pytest.mark.parametrize("bool_agg_func", ["any", "all"]) +def test_object_NA_raises_with_skipna_false(bool_agg_func): + # GH#37501 + ser = Series([pd.NA], dtype=object) + with pytest.raises(TypeError, match="boolean value of NA is ambiguous"): + ser.groupby([1]).agg(bool_agg_func, skipna=False)