Skip to content

BUG: GroupBy.min/max with unordered Categorical and no groups #51034

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 5 commits into from
Jan 31, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v2.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -956,6 +956,7 @@ Categorical
- Bug in :meth:`Series.replace` with categorical dtype losing nullable dtypes of underlying categories (:issue:`49404`)
- Bug in :meth:`DataFrame.groupby` and :meth:`Series.groupby` would reorder categories when used as a grouper (:issue:`48749`)
- Bug in :class:`Categorical` constructor when constructing from a :class:`Categorical` object and ``dtype="category"`` losing ordered-ness (:issue:`49309`)
- Bug in :meth:`SeriesGroupBy.min`, :meth:`SeriesGroupBy.max`, :meth:`DataFrameGroupBy.min`, and :meth:`DataFrameGroupBy.max` with unordered :class:`CategoricalDtype` with no groups failing to raise ``TypeError`` (:issue:`51034`)
-

Datetimelike
Expand Down
15 changes: 8 additions & 7 deletions pandas/core/groupby/ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -226,9 +226,10 @@ def _disallow_invalid_ops(self, dtype: DtypeObj, is_numeric: bool = False):

Raises
------
TypeError
This is not a valid operation for this dtype.
NotImplementedError
This is either not a valid function for this dtype, or
valid but not implemented in cython.
This may be a valid operation, but does not have a cython implementation.
"""
how = self.how

Expand All @@ -237,16 +238,16 @@ def _disallow_invalid_ops(self, dtype: DtypeObj, is_numeric: bool = False):
return

if isinstance(dtype, CategoricalDtype):
# NotImplementedError for methods that can fall back to a
# non-cython implementation.
if how in ["sum", "prod", "cumsum", "cumprod"]:
raise TypeError(f"{dtype} type does not support {how} operations")
if how in ["min", "max", "rank"] and not dtype.ordered:
# raise TypeError instead of NotImplementedError to ensure we
# don't go down a group-by-group path, since in the empty-groups
# case that would fail to raise
raise TypeError(f"Cannot perform {how} with non-ordered Categorical")
if how not in ["rank"]:
# only "rank" is implemented in cython
raise NotImplementedError(f"{dtype} dtype not supported")
if not dtype.ordered:
# TODO: TypeError?
raise NotImplementedError(f"{dtype} dtype not supported")

elif is_sparse(dtype):
# categoricals are only 1d, so we
Expand Down
2 changes: 2 additions & 0 deletions pandas/tests/groupby/test_function.py
Original file line number Diff line number Diff line change
Expand Up @@ -250,6 +250,7 @@ def _check(self, df, method, expected_columns, expected_columns_numeric):
[
"Categorical is not ordered",
"function is not implemented for this dtype",
f"Cannot perform {method} with non-ordered Categorical",
]
)
with pytest.raises(exception, match=msg):
Expand All @@ -276,6 +277,7 @@ def _check(self, df, method, expected_columns, expected_columns_numeric):
"category type does not support",
"can't multiply sequence",
"function is not implemented for this dtype",
f"Cannot perform {method} with non-ordered Categorical",
]
)
with pytest.raises(exception, match=msg):
Expand Down
81 changes: 36 additions & 45 deletions pandas/tests/groupby/test_groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -1901,25 +1901,12 @@ def test_empty_groupby(
raises=ValueError, match="attempt to get arg(min|max) of an empty sequence"
)
request.node.add_marker(mark)
elif (
isinstance(values, Categorical)
and len(keys) == 1
and not isinstance(columns, list)
):
mark = pytest.mark.xfail(
raises=TypeError, match="'Categorical' does not implement"
)
request.node.add_marker(mark)
elif isinstance(values, Categorical) and len(keys) == 1 and op in ["sum", "prod"]:
mark = pytest.mark.xfail(
raises=AssertionError, match="(DataFrame|Series) are different"
)
request.node.add_marker(mark)
elif (
isinstance(values, Categorical)
and len(keys) == 2
and op in ["min", "max", "sum"]
):
elif isinstance(values, Categorical) and len(keys) == 2 and op in ["sum"]:
mark = pytest.mark.xfail(
raises=AssertionError, match="(DataFrame|Series) are different"
)
Expand Down Expand Up @@ -1949,6 +1936,31 @@ def get_result(**kwargs):
else:
return getattr(gb, method)(op, **kwargs)

if isinstance(values, Categorical) and not values.ordered and op in ["min", "max"]:
msg = f"Cannot perform {op} with non-ordered Categorical"
with pytest.raises(TypeError, match=msg):
get_result()

if isinstance(columns, list):
# i.e. DataFrameGroupBy, not SeriesGroupBy
result = get_result(numeric_only=True)

# Categorical is special: without 'observed=True', we get a NaN entry
# corresponding to the unobserved group. If we passed observed=True
# to groupby, expected would just be 'df.set_index(keys)[columns]',
# as below
lev = Categorical([0], dtype=values.dtype)
if len(keys) != 1:
idx = MultiIndex.from_product([lev, lev], names=keys)
else:
# all columns are dropped, but we end up with one row
# Categorical is special without 'observed=True'
idx = Index(lev, name=keys[0])

expected = DataFrame([], columns=[], index=idx)
tm.assert_equal(result, expected)
return

if columns == "C":
# i.e. SeriesGroupBy
if op in ["prod", "sum", "skew"]:
Expand Down Expand Up @@ -2018,38 +2030,17 @@ def get_result(**kwargs):
tm.assert_equal(result, expected)
return

if (op in ["min", "max", "skew"] and isinstance(values, Categorical)) or (
op == "skew" and df.dtypes[0].kind == "M"
if op == "skew" and (
isinstance(values, Categorical) or df.dtypes[0].kind == "M"
):
if op == "skew" or len(keys) == 1:
msg = "|".join(
[
"Categorical is not ordered",
"does not support reduction",
]
)
with pytest.raises(TypeError, match=msg):
get_result()
return
# Categorical doesn't implement, so with numeric_only=True
# these are dropped and we get an empty DataFrame back
result = get_result()

# with numeric_only=True, these are dropped, and we get
# an empty DataFrame back
if len(keys) != 1:
# Categorical is special without 'observed=True'
lev = Categorical([0], dtype=values.dtype)
mi = MultiIndex.from_product([lev, lev], names=keys)
expected = DataFrame([], columns=[], index=mi)
else:
# all columns are dropped, but we end up with one row
# Categorical is special without 'observed=True'
lev = Categorical([0], dtype=values.dtype)
ci = Index(lev, name=keys[0])
expected = DataFrame([], columns=[], index=ci)

tm.assert_equal(result, expected)
msg = "|".join(
[
"Categorical is not ordered",
"does not support reduction",
]
)
with pytest.raises(TypeError, match=msg):
get_result()
return

result = get_result()
Expand Down
17 changes: 17 additions & 0 deletions pandas/tests/groupby/test_rank.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,23 @@
import pandas._testing as tm


def test_rank_unordered_categorical_typeerror():
    """Check that groupby rank on an unordered Categorical raises TypeError.

    GH#51034: the error should be a TypeError, not a NotImplementedError.
    The check is run for both a Series and its single-column DataFrame,
    each grouped by the same empty, unordered Categorical.
    """
    empty_cat = pd.Categorical([], ordered=False)
    as_series = Series(empty_cat)
    as_frame = as_series.to_frame()

    expected_msg = "Cannot perform rank with non-ordered Categorical"

    # Series first, then DataFrame: same grouper, same expected error.
    for obj in (as_series, as_frame):
        grouped = obj.groupby(empty_cat)
        with pytest.raises(TypeError, match=expected_msg):
            grouped.rank()


def test_rank_apply():
lev1 = tm.rands_array(10, 100)
lev2 = tm.rands_array(10, 130)
Expand Down