From 0c29b78d841187f49600f446c16a922cf73e5ffd Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Wed, 24 Mar 2021 17:16:01 -0400 Subject: [PATCH 1/8] WIP --- pandas/core/groupby/groupby.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 1f79823746a87..041251ba01662 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -14,6 +14,7 @@ class providing the base-class of operations. partial, wraps, ) +from pandas.core.arrays.boolean import BooleanDtype import inspect from textwrap import dedent import types @@ -109,6 +110,10 @@ class providing the base-class of operations. from pandas.core.sorting import get_group_index_sorter from pandas.core.util.numba_ import NUMBA_FUNC_CACHE +from pandas.core.dtypes.base import ( + ExtensionDtype, + +) _common_see_also = """ See Also -------- @@ -1416,15 +1421,19 @@ def _obj_1d_constructor(self) -> Type[Series]: return self.obj._constructor @final - def _bool_agg(self, val_test, skipna): + def _bool_agg(self, val_test, skipna: bool): """ Shared func to call any / all Cython GroupBy implementations. """ - def objs_to_bool(vals: np.ndarray) -> Tuple[np.ndarray, Type]: + def objs_to_bool(vals: ArrayLike) -> Tuple[ArrayLike, Type]: + # dtype = bool if is_object_dtype(vals): - vals = np.array([bool(x) for x in vals]) + vals = np.array([bool(x) if notna(x) else True for x in vals]) else: + if isinstance(vals, ExtensionArray): + vals = vals.to_numpy(dtype=bool, na_value=np.nan) + vals = vals.astype(bool) return vals.view(np.uint8), bool From b21258d08bc51b35e0d4651f4936b761561113d7 Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Wed, 24 Mar 2021 19:16:57 -0400 Subject: [PATCH 2/8] BUG: groupby any/all raising on extension types --- doc/source/whatsnew/v1.3.0.rst | 2 ++ pandas/core/groupby/groupby.py | 5 ++--- pandas/tests/groupby/test_groupby.py | 13 +++++++++++++ 3 files changed, 17 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index 512e6e6cbb391..55007a537154a 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -633,6 +633,8 @@ Groupby/resample/rolling - Bug in :class:`core.window.ewm.ExponentialMovingWindow` when calling ``__getitem__`` would not retain ``com``, ``span``, ``alpha`` or ``halflife`` attributes (:issue:`40164`) - :class:`core.window.ewm.ExponentialMovingWindow` now raises a ``NotImplementedError`` when specifying ``times`` with ``adjust=False`` due to an incorrect calculation (:issue:`40098`) - Bug in :meth:`Series.asfreq` and :meth:`DataFrame.asfreq` dropping rows when the index is not sorted (:issue:`39805`) +- Bug in :meth:`DataFrameGroupBy.any` and :meth:`DataFrameGroupBy.all` raising ``ValueError`` with ``ExtensionDType`` columns holding ``NA`` even with ``skipna=False`` (:issue:`40585`) +- Reshaping ^^^^^^^^^ diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 041251ba01662..0b2453d6008d2 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -1426,10 +1426,9 @@ def _bool_agg(self, val_test, skipna: bool): Shared func to call any / all Cython GroupBy implementations. """ - def objs_to_bool(vals: ArrayLike) -> Tuple[ArrayLike, Type]: - # dtype = bool + def objs_to_bool(vals: ArrayLike) -> Tuple[np.ndarray, Type]: if is_object_dtype(vals): - vals = np.array([bool(x) if notna(x) else True for x in vals]) + vals = np.array([bool(x) for x in vals]) else: if isinstance(vals, ExtensionArray): vals = vals.to_numpy(dtype=bool, na_value=np.nan) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 12247e2445295..7f7739019c7a8 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -1994,6 +1994,19 @@ def test_bool_aggs_dup_column_labels(bool_agg_func): tm.assert_frame_equal(result, expected) +@pytest.mark.parametrize("bool_agg_func", ["any", "all"]) +@pytest.mark.parametrize("dtype", ["Int64", "Float64", "boolean"]) +def test_bool_aggs_ea_skipna(bool_agg_func, dtype): + # GH-40585 + df = pd.DataFrame({"grp": [1, 1], "val": [pd.NA, 1]}, dtype=dtype) + grouped = df.groupby("grp") + result = grouped.agg(bool_agg_func, skipna=True) + + # Avoiding asserting frame equality because the index becomes + # object type + assert bool(result["val"].iloc[0]) is True + + @pytest.mark.parametrize( "idx", [Index(["a", "a"]), MultiIndex.from_tuples((("a", "a"), ("a", "a")))] ) From d86a81d49e460b92064166ea0997d1a8affb0c7b Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Wed, 24 Mar 2021 19:20:21 -0400 Subject: [PATCH 3/8] precommit fixup --- pandas/core/groupby/groupby.py | 5 ----- pandas/tests/groupby/test_groupby.py | 2 +- 2 files changed, 1 insertion(+), 6 deletions(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 0b2453d6008d2..09422346f22dc 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -14,7 +14,6 @@ class providing the base-class of operations. partial, wraps, ) -from pandas.core.arrays.boolean import BooleanDtype import inspect from textwrap import dedent import types @@ -110,10 +109,6 @@ class providing the base-class of operations. from pandas.core.sorting import get_group_index_sorter from pandas.core.util.numba_ import NUMBA_FUNC_CACHE -from pandas.core.dtypes.base import ( - ExtensionDtype, - -) _common_see_also = """ See Also -------- diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 7f7739019c7a8..53ba717f21c21 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -1998,7 +1998,7 @@ def test_bool_aggs_dup_column_labels(bool_agg_func): @pytest.mark.parametrize("dtype", ["Int64", "Float64", "boolean"]) def test_bool_aggs_ea_skipna(bool_agg_func, dtype): # GH-40585 - df = pd.DataFrame({"grp": [1, 1], "val": [pd.NA, 1]}, dtype=dtype) + df = DataFrame({"grp": [1, 1], "val": [pd.NA, 1]}, dtype=dtype) grouped = df.groupby("grp") result = grouped.agg(bool_agg_func, skipna=True) From df672daf72eeaf3c911953bf5732d0ea3b772223 Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Wed, 24 Mar 2021 19:21:56 -0400 Subject: [PATCH 4/8] Fix whatsnew typo --- doc/source/whatsnew/v1.3.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index 55007a537154a..f63d63de88849 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -633,7 +633,7 @@ Groupby/resample/rolling - Bug in :class:`core.window.ewm.ExponentialMovingWindow` when calling ``__getitem__`` would not retain ``com``, ``span``, ``alpha`` or ``halflife`` attributes (:issue:`40164`) - :class:`core.window.ewm.ExponentialMovingWindow` now raises a ``NotImplementedError`` when specifying ``times`` with ``adjust=False`` due to an incorrect calculation (:issue:`40098`) - Bug in :meth:`Series.asfreq` and :meth:`DataFrame.asfreq` dropping rows when the index is not sorted (:issue:`39805`) -- Bug in :meth:`DataFrameGroupBy.any` and :meth:`DataFrameGroupBy.all` raising ``ValueError`` with ``ExtensionDType`` columns holding ``NA`` even with ``skipna=False`` (:issue:`40585`) +- Bug in :meth:`DataFrameGroupBy.any` and :meth:`DataFrameGroupBy.all` raising ``ValueError`` with ``ExtensionDType`` columns holding ``NA`` even with ``skipna=True`` (:issue:`40585`) - Reshaping From 99ed6e36138140ffc31dfba861705edbd9ef610e Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Thu, 25 Mar 2021 12:38:44 -0400 Subject: [PATCH 5/8] Use elif, add series test --- pandas/core/groupby/groupby.py | 6 +++--- pandas/tests/groupby/test_function.py | 28 +++++++++++++++++++++++++++ pandas/tests/groupby/test_groupby.py | 24 ----------------------- 3 files changed, 31 insertions(+), 27 deletions(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index dc264deb9f53c..0ba5fdfc5e111 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -1424,10 +1424,10 @@ def _bool_agg(self, val_test, skipna: bool): def objs_to_bool(vals: ArrayLike) -> Tuple[np.ndarray, Type]: if is_object_dtype(vals): vals = np.array([bool(x) for x in vals]) + elif isinstance(vals, ExtensionArray): + vals = vals.to_numpy(dtype=bool, na_value=np.nan) + vals = vals.astype(bool) else: - if isinstance(vals, ExtensionArray): - vals = vals.to_numpy(dtype=bool, na_value=np.nan) - vals = vals.astype(bool) return vals.view(np.uint8), bool diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py index 515774eae009b..0f386cf7e8dea 100644 --- a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -76,6 +76,34 @@ def test_groupby_bool_aggs(agg_func, skipna, vals): tm.assert_frame_equal(result, exp_df) +@pytest.mark.parametrize("bool_agg_func", ["any", "all"]) +def test_bool_aggs_dup_column_labels(bool_agg_func): + # 21668 + df = DataFrame([[True, True]], columns=["a", "a"]) + grp_by = df.groupby([0]) + result = getattr(grp_by, bool_agg_func)() + + expected = df + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("bool_agg_func", ["any", "all"]) +@pytest.mark.parametrize("dtype", ["Int64", "Float64", "boolean"]) +@pytest.mark.parametrize("group_by_frame", [True, False]) +def test_bool_aggs_ea_skipna(bool_agg_func, dtype, group_by_frame): + # GH-40585 + df = DataFrame({"grp": [1, 1], "val": pd.array([pd.NA, 1], dtype=dtype)}) + if group_by_frame: + grouped = df.groupby("grp") + expected = DataFrame({"val": [True]}, index=pd.Index([1], name="grp")) + else: + grouped = df["val"].groupby(df["grp"]) + expected = Series([True], index=pd.Index([1], name="grp"), name="val") + + result = grouped.agg(bool_agg_func, skipna=True) + tm.assert_equal(result, expected) + + def test_max_min_non_numeric(): # #2700 aa = DataFrame({"nn": [11, 11, 22, 22], "ii": [1, 2, 3, 4], "ss": 4 * ["mama"]}) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 53ba717f21c21..53e30b477880c 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -1983,30 +1983,6 @@ def test_groupby_duplicate_index(): tm.assert_series_equal(result, expected) -@pytest.mark.parametrize("bool_agg_func", ["any", "all"]) -def test_bool_aggs_dup_column_labels(bool_agg_func): - # 21668 - df = DataFrame([[True, True]], columns=["a", "a"]) - grp_by = df.groupby([0]) - result = getattr(grp_by, bool_agg_func)() - - expected = df - tm.assert_frame_equal(result, expected) - - -@pytest.mark.parametrize("bool_agg_func", ["any", "all"]) -@pytest.mark.parametrize("dtype", ["Int64", "Float64", "boolean"]) -def test_bool_aggs_ea_skipna(bool_agg_func, dtype): - # GH-40585 - df = DataFrame({"grp": [1, 1], "val": [pd.NA, 1]}, dtype=dtype) - grouped = df.groupby("grp") - result = grouped.agg(bool_agg_func, skipna=True) - - # Avoiding asserting frame equality because the index becomes - # object type - assert bool(result["val"].iloc[0]) is True - - @pytest.mark.parametrize( "idx", [Index(["a", "a"]), MultiIndex.from_tuples((("a", "a"), ("a", "a")))] ) From 5d7bd1c48e5f765e78c60be432ccffee3b45f29d Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Thu, 25 Mar 2021 12:48:22 -0400 Subject: [PATCH 6/8] precommit fixup --- pandas/tests/groupby/test_function.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py index 0f386cf7e8dea..b0540b84d34c1 100644 --- a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -95,10 +95,10 @@ def test_bool_aggs_ea_skipna(bool_agg_func, dtype, group_by_frame): df = DataFrame({"grp": [1, 1], "val": pd.array([pd.NA, 1], dtype=dtype)}) if group_by_frame: grouped = df.groupby("grp") - expected = DataFrame({"val": [True]}, index=pd.Index([1], name="grp")) + expected = DataFrame({"val": [True]}, index=Index([1], name="grp")) else: grouped = df["val"].groupby(df["grp"]) - expected = Series([True], index=pd.Index([1], name="grp"), name="val") + expected = Series([True], index=Index([1], name="grp"), name="val") result = grouped.agg(bool_agg_func, skipna=True) tm.assert_equal(result, expected) From 251d38674f1ec7dc7e34c4c4db86ffb1dd665ad0 Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Thu, 25 Mar 2021 12:50:46 -0400 Subject: [PATCH 7/8] Amend whatsnew for series groupby --- doc/source/whatsnew/v1.3.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index f63d63de88849..c6daa53b975ea 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -633,7 +633,7 @@ Groupby/resample/rolling - Bug in :class:`core.window.ewm.ExponentialMovingWindow` when calling ``__getitem__`` would not retain ``com``, ``span``, ``alpha`` or ``halflife`` attributes (:issue:`40164`) - :class:`core.window.ewm.ExponentialMovingWindow` now raises a ``NotImplementedError`` when specifying ``times`` with ``adjust=False`` due to an incorrect calculation (:issue:`40098`) - Bug in :meth:`Series.asfreq` and :meth:`DataFrame.asfreq` dropping rows when the index is not sorted (:issue:`39805`) -- Bug in :meth:`DataFrameGroupBy.any` and :meth:`DataFrameGroupBy.all` raising ``ValueError`` with ``ExtensionDType`` columns holding ``NA`` even with ``skipna=True`` (:issue:`40585`) +- Bug in :class:`SeriesGroupBy` and :class:`DataFrameGroupBy` raising ``ValueError`` when using methods ``any`` and ``all`` with ``ExtensionDType`` columns holding ``NA`` even with ``skipna=True`` (:issue:`40585`) - Reshaping From 50ca7647247b888aba258b92e31dcf51cbbcb508 Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Fri, 2 Apr 2021 17:47:35 -0400 Subject: [PATCH 8/8] Add skipna parameterization --- pandas/tests/groupby/test_function.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py index b0540b84d34c1..e16d2e0f2dd0a 100644 --- a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -77,11 +77,12 @@ def test_groupby_bool_aggs(agg_func, skipna, vals): @pytest.mark.parametrize("bool_agg_func", ["any", "all"]) -def test_bool_aggs_dup_column_labels(bool_agg_func): +@pytest.mark.parametrize("skipna", [True, False]) +def test_bool_aggs_dup_column_labels(bool_agg_func, skipna): # 21668 df = DataFrame([[True, True]], columns=["a", "a"]) grp_by = df.groupby([0]) - result = getattr(grp_by, bool_agg_func)() + result = getattr(grp_by, bool_agg_func)(skipna=skipna) expected = df tm.assert_frame_equal(result, expected)