diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index 4f23a23992c54..f11397e5b8ada 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -956,6 +956,7 @@ Categorical - Bug in :meth:`Series.replace` with categorical dtype losing nullable dtypes of underlying categories (:issue:`49404`) - Bug in :meth:`DataFrame.groupby` and :meth:`Series.groupby` would reorder categories when used as a grouper (:issue:`48749`) - Bug in :class:`Categorical` constructor when constructing from a :class:`Categorical` object and ``dtype="category"`` losing ordered-ness (:issue:`49309`) +- Bug in :meth:`SeriesGroupBy.min`, :meth:`SeriesGroupBy.max`, :meth:`DataFrameGroupBy.min`, and :meth:`DataFrameGroupBy.max` with unordered :class:`CategoricalDtype` with no groups failing to raise ``TypeError`` (:issue:`51034`) - Datetimelike diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index f88236b2464c1..94f396ccef6b1 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -226,9 +226,10 @@ def _disallow_invalid_ops(self, dtype: DtypeObj, is_numeric: bool = False): Raises ------ + TypeError + This is not a valid operation for this dtype. NotImplementedError - This is either not a valid function for this dtype, or - valid but not implemented in cython. + This may be a valid operation, but does not have a cython implementation. """ how = self.how @@ -237,16 +238,16 @@ def _disallow_invalid_ops(self, dtype: DtypeObj, is_numeric: bool = False): return if isinstance(dtype, CategoricalDtype): - # NotImplementedError for methods that can fall back to a - # non-cython implementation. if how in ["sum", "prod", "cumsum", "cumprod"]: raise TypeError(f"{dtype} type does not support {how} operations") + if how in ["min", "max", "rank"] and not dtype.ordered: + # raise TypeError instead of NotImplementedError to ensure we + # don't go down a group-by-group path, since in the empty-groups + # case that would fail to raise + raise TypeError(f"Cannot perform {how} with non-ordered Categorical") if how not in ["rank"]: # only "rank" is implemented in cython raise NotImplementedError(f"{dtype} dtype not supported") - if not dtype.ordered: - # TODO: TypeError? - raise NotImplementedError(f"{dtype} dtype not supported") elif is_sparse(dtype): # categoricals are only 1d, so we diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py index 1e16e353cc1a4..1ef97fed7ba05 100644 --- a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -250,6 +250,7 @@ def _check(self, df, method, expected_columns, expected_columns_numeric): [ "Categorical is not ordered", "function is not implemented for this dtype", + f"Cannot perform {method} with non-ordered Categorical", ] ) with pytest.raises(exception, match=msg): @@ -276,6 +277,7 @@ def _check(self, df, method, expected_columns, expected_columns_numeric): "category type does not support", "can't multiply sequence", "function is not implemented for this dtype", + f"Cannot perform {method} with non-ordered Categorical", ] ) with pytest.raises(exception, match=msg): diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index a1c1930c2e11b..988b8a67babff 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -1901,25 +1901,12 @@ def test_empty_groupby( raises=ValueError, match="attempt to get arg(min|max) of an empty sequence" ) request.node.add_marker(mark) - elif ( - isinstance(values, Categorical) - and len(keys) == 1 - and not isinstance(columns, list) - ): - mark = pytest.mark.xfail( - raises=TypeError, match="'Categorical' does not implement" - ) - request.node.add_marker(mark) elif isinstance(values, Categorical) and len(keys) == 1 and op in ["sum", "prod"]: mark = pytest.mark.xfail( raises=AssertionError, match="(DataFrame|Series) are different" ) request.node.add_marker(mark) - elif ( - isinstance(values, Categorical) - and len(keys) == 2 - and op in ["min", "max", "sum"] - ): + elif isinstance(values, Categorical) and len(keys) == 2 and op in ["sum"]: mark = pytest.mark.xfail( raises=AssertionError, match="(DataFrame|Series) are different" ) @@ -1949,6 +1936,31 @@ def get_result(**kwargs): else: return getattr(gb, method)(op, **kwargs) + if isinstance(values, Categorical) and not values.ordered and op in ["min", "max"]: + msg = f"Cannot perform {op} with non-ordered Categorical" + with pytest.raises(TypeError, match=msg): + get_result() + + if isinstance(columns, list): + # i.e. DataframeGroupBy, not SeriesGroupBy + result = get_result(numeric_only=True) + + # Categorical is special without 'observed=True', we get an NaN entry + # corresponding to the unobserved group. If we passed observed=True + # to groupby, expected would just be 'df.set_index(keys)[columns]' + # as below + lev = Categorical([0], dtype=values.dtype) + if len(keys) != 1: + idx = MultiIndex.from_product([lev, lev], names=keys) + else: + # all columns are dropped, but we end up with one row + # Categorical is special without 'observed=True' + idx = Index(lev, name=keys[0]) + + expected = DataFrame([], columns=[], index=idx) + tm.assert_equal(result, expected) + return + if columns == "C": # i.e. SeriesGroupBy if op in ["prod", "sum", "skew"]: @@ -2018,38 +2030,17 @@ def get_result(**kwargs): tm.assert_equal(result, expected) return - if (op in ["min", "max", "skew"] and isinstance(values, Categorical)) or ( - op == "skew" and df.dtypes[0].kind == "M" + if op == "skew" and ( + isinstance(values, Categorical) or df.dtypes[0].kind == "M" ): - if op == "skew" or len(keys) == 1: - msg = "|".join( - [ - "Categorical is not ordered", - "does not support reduction", - ] - ) - with pytest.raises(TypeError, match=msg): - get_result() - return - # Categorical doesn't implement, so with numeric_only=True - # these are dropped and we get an empty DataFrame back - result = get_result() - - # with numeric_only=True, these are dropped, and we get - # an empty DataFrame back - if len(keys) != 1: - # Categorical is special without 'observed=True' - lev = Categorical([0], dtype=values.dtype) - mi = MultiIndex.from_product([lev, lev], names=keys) - expected = DataFrame([], columns=[], index=mi) - else: - # all columns are dropped, but we end up with one row - # Categorical is special without 'observed=True' - lev = Categorical([0], dtype=values.dtype) - ci = Index(lev, name=keys[0]) - expected = DataFrame([], columns=[], index=ci) - - tm.assert_equal(result, expected) + msg = "|".join( + [ + "Categorical is not ordered", + "does not support reduction", + ] + ) + with pytest.raises(TypeError, match=msg): + get_result() return result = get_result() diff --git a/pandas/tests/groupby/test_rank.py b/pandas/tests/groupby/test_rank.py index 8bbe38d3379ac..d0b848a567346 100644 --- a/pandas/tests/groupby/test_rank.py +++ b/pandas/tests/groupby/test_rank.py @@ -13,6 +13,23 @@ import pandas._testing as tm +def test_rank_unordered_categorical_typeerror(): + # GH#51034 should be TypeError, not NotImplementedError + cat = pd.Categorical([], ordered=False) + ser = Series(cat) + df = ser.to_frame() + + msg = "Cannot perform rank with non-ordered Categorical" + + gb = ser.groupby(cat) + with pytest.raises(TypeError, match=msg): + gb.rank() + + gb2 = df.groupby(cat) + with pytest.raises(TypeError, match=msg): + gb2.rank() + + def test_rank_apply(): lev1 = tm.rands_array(10, 100) lev2 = tm.rands_array(10, 130)