From 50e8ef90885a98dade1ae743317fccab3b6af54c Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 27 Jan 2023 16:49:02 -0800 Subject: [PATCH 1/4] BUG: GroupBy.min/max with unordered Categorical and no groups --- doc/source/whatsnew/v2.0.0.rst | 1 + pandas/core/groupby/ops.py | 15 +++--- pandas/tests/groupby/test_groupby.py | 81 +++++++++++++--------------- 3 files changed, 45 insertions(+), 52 deletions(-) diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index c1d9b2744b27e..c47740bf761bd 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -955,6 +955,7 @@ Categorical - Bug in :meth:`Series.replace` with categorical dtype losing nullable dtypes of underlying categories (:issue:`49404`) - Bug in :meth:`DataFrame.groupby` and :meth:`Series.groupby` would reorder categories when used as a grouper (:issue:`48749`) - Bug in :class:`Categorical` constructor when constructing from a :class:`Categorical` object and ``dtype="category"`` losing ordered-ness (:issue:`49309`) +- Bug in :meth:`SeriesGroupBy.min`, :meth:`SeriesGroupBy.max`, :meth:`DataFrameGroupBy.min`, and :meth:`DataFrameGroupBy.max` with unordered :class:`CategoricalDtype` with no groups failing to raise ``TypeError`` (:issue:`??`) - Datetimelike diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index f88236b2464c1..94f396ccef6b1 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -226,9 +226,10 @@ def _disallow_invalid_ops(self, dtype: DtypeObj, is_numeric: bool = False): Raises ------ + TypeError + This is not a valid operation for this dtype. NotImplementedError - This is either not a valid function for this dtype, or - valid but not implemented in cython. + This may be a valid operation, but does not have a cython implementation. """ how = self.how @@ -237,16 +238,16 @@ def _disallow_invalid_ops(self, dtype: DtypeObj, is_numeric: bool = False): return if isinstance(dtype, CategoricalDtype): - # NotImplementedError for methods that can fall back to a - # non-cython implementation. if how in ["sum", "prod", "cumsum", "cumprod"]: raise TypeError(f"{dtype} type does not support {how} operations") + if how in ["min", "max", "rank"] and not dtype.ordered: + # raise TypeError instead of NotImplementedError to ensure we + # don't go down a group-by-group path, since in the empty-groups + # case that would fail to raise + raise TypeError(f"Cannot perform {how} with non-ordered Categorical") if how not in ["rank"]: # only "rank" is implemented in cython raise NotImplementedError(f"{dtype} dtype not supported") - if not dtype.ordered: - # TODO: TypeError? - raise NotImplementedError(f"{dtype} dtype not supported") elif is_sparse(dtype): # categoricals are only 1d, so we diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index a1c1930c2e11b..988b8a67babff 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -1901,25 +1901,12 @@ def test_empty_groupby( raises=ValueError, match="attempt to get arg(min|max) of an empty sequence" ) request.node.add_marker(mark) - elif ( - isinstance(values, Categorical) - and len(keys) == 1 - and not isinstance(columns, list) - ): - mark = pytest.mark.xfail( - raises=TypeError, match="'Categorical' does not implement" - ) - request.node.add_marker(mark) elif isinstance(values, Categorical) and len(keys) == 1 and op in ["sum", "prod"]: mark = pytest.mark.xfail( raises=AssertionError, match="(DataFrame|Series) are different" ) request.node.add_marker(mark) - elif ( - isinstance(values, Categorical) - and len(keys) == 2 - and op in ["min", "max", "sum"] - ): + elif isinstance(values, Categorical) and len(keys) == 2 and op in ["sum"]: mark = pytest.mark.xfail( raises=AssertionError, match="(DataFrame|Series) are different" ) @@ -1949,6 +1936,31 @@ def get_result(**kwargs): else: return getattr(gb, method)(op, **kwargs) + if isinstance(values, Categorical) and not values.ordered and op in ["min", "max"]: + msg = f"Cannot perform {op} with non-ordered Categorical" + with pytest.raises(TypeError, match=msg): + get_result() + + if isinstance(columns, list): + # i.e. DataframeGroupBy, not SeriesGroupBy + result = get_result(numeric_only=True) + + # Categorical is special without 'observed=True', we get an NaN entry + # corresponding to the unobserved group. If we passed observed=True + # to groupby, expected would just be 'df.set_index(keys)[columns]' + # as below + lev = Categorical([0], dtype=values.dtype) + if len(keys) != 1: + idx = MultiIndex.from_product([lev, lev], names=keys) + else: + # all columns are dropped, but we end up with one row + # Categorical is special without 'observed=True' + idx = Index(lev, name=keys[0]) + + expected = DataFrame([], columns=[], index=idx) + tm.assert_equal(result, expected) + return + if columns == "C": # i.e. SeriesGroupBy if op in ["prod", "sum", "skew"]: @@ -2018,38 +2030,17 @@ def get_result(**kwargs): tm.assert_equal(result, expected) return - if (op in ["min", "max", "skew"] and isinstance(values, Categorical)) or ( - op == "skew" and df.dtypes[0].kind == "M" + if op == "skew" and ( + isinstance(values, Categorical) or df.dtypes[0].kind == "M" ): - if op == "skew" or len(keys) == 1: - msg = "|".join( - [ - "Categorical is not ordered", - "does not support reduction", - ] - ) - with pytest.raises(TypeError, match=msg): - get_result() - return - # Categorical doesn't implement, so with numeric_only=True - # these are dropped and we get an empty DataFrame back - result = get_result() - - # with numeric_only=True, these are dropped, and we get - # an empty DataFrame back - if len(keys) != 1: - # Categorical is special without 'observed=True' - lev = Categorical([0], dtype=values.dtype) - mi = MultiIndex.from_product([lev, lev], names=keys) - expected = DataFrame([], columns=[], index=mi) - else: - # all columns are dropped, but we end up with one row - # Categorical is special without 'observed=True' - lev = Categorical([0], dtype=values.dtype) - ci = Index(lev, name=keys[0]) - expected = DataFrame([], columns=[], index=ci) - - tm.assert_equal(result, expected) + msg = "|".join( + [ + "Categorical is not ordered", + "does not support reduction", + ] + ) + with pytest.raises(TypeError, match=msg): + get_result() return result = get_result() From 7926c6cf0b5c00595deed5da69d542a1609faf0e Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 27 Jan 2023 16:50:38 -0800 Subject: [PATCH 2/4] GH ref --- doc/source/whatsnew/v2.0.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index c47740bf761bd..77a9bb76b7c4f 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -955,7 +955,7 @@ Categorical - Bug in :meth:`Series.replace` with categorical dtype losing nullable dtypes of underlying categories (:issue:`49404`) - Bug in :meth:`DataFrame.groupby` and :meth:`Series.groupby` would reorder categories when used as a grouper (:issue:`48749`) - Bug in :class:`Categorical` constructor when constructing from a :class:`Categorical` object and ``dtype="category"`` losing ordered-ness (:issue:`49309`) -- Bug in :meth:`SeriesGroupBy.min`, :meth:`SeriesGroupBy.max`, :meth:`DataFrameGroupBy.min`, and :meth:`DataFrameGroupBy.max` with unordered :class:`CategoricalDtype` with no groups failing to raise ``TypeError`` (:issue:`??`) +- Bug in :meth:`SeriesGroupBy.min`, :meth:`SeriesGroupBy.max`, :meth:`DataFrameGroupBy.min`, and :meth:`DataFrameGroupBy.max` with unordered :class:`CategoricalDtype` with no groups failing to raise ``TypeError`` (:issue:`51034`) - Datetimelike From 2c6e10d17d783739b2e7225e594369d19c25ce1f Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 27 Jan 2023 19:09:04 -0800 Subject: [PATCH 3/4] update test --- pandas/tests/groupby/test_function.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py index 1e16e353cc1a4..1ef97fed7ba05 100644 --- a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -250,6 +250,7 @@ def _check(self, df, method, expected_columns, expected_columns_numeric): [ "Categorical is not ordered", "function is not implemented for this dtype", + f"Cannot perform {method} with non-ordered Categorical", ] ) with pytest.raises(exception, match=msg): @@ -276,6 +277,7 @@ def _check(self, df, method, expected_columns, expected_columns_numeric): "category type does not support", "can't multiply sequence", "function is not implemented for this dtype", + f"Cannot perform {method} with non-ordered Categorical", ] ) with pytest.raises(exception, match=msg): From d331b9915477ffecf8c5b02994c534a9ab5b300a Mon Sep 17 00:00:00 2001 From: Brock Date: Mon, 30 Jan 2023 12:33:05 -0800 Subject: [PATCH 4/4] test for non-ordered Categorical rank --- pandas/tests/groupby/test_rank.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/pandas/tests/groupby/test_rank.py b/pandas/tests/groupby/test_rank.py index 8bbe38d3379ac..d0b848a567346 100644 --- a/pandas/tests/groupby/test_rank.py +++ b/pandas/tests/groupby/test_rank.py @@ -13,6 +13,23 @@ import pandas._testing as tm +def test_rank_unordered_categorical_typeerror(): + # GH#51034 should be TypeError, not NotImplementedError + cat = pd.Categorical([], ordered=False) + ser = Series(cat) + df = ser.to_frame() + + msg = "Cannot perform rank with non-ordered Categorical" + + gb = ser.groupby(cat) + with pytest.raises(TypeError, match=msg): + gb.rank() + + gb2 = df.groupby(cat) + with pytest.raises(TypeError, match=msg): + gb2.rank() + + def test_rank_apply(): lev1 = tm.rands_array(10, 100) lev2 = tm.rands_array(10, 130)