Skip to content

Commit 39f72da

Browse files
authored
BUG: GroupBy.min/max with unordered Categorical and no groups (#51034)
* BUG: GroupBy.min/max with unordered Categorical and no groups
* GH ref
* update test
* test for non-ordered Categorical rank
1 parent ae61ccd commit 39f72da

File tree

5 files changed

+64
-52
lines changed

5 files changed

+64
-52
lines changed

doc/source/whatsnew/v2.0.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -959,6 +959,7 @@ Categorical
959959
- Bug in :meth:`Series.replace` with categorical dtype losing nullable dtypes of underlying categories (:issue:`49404`)
960960
- Bug in :meth:`DataFrame.groupby` and :meth:`Series.groupby` would reorder categories when used as a grouper (:issue:`48749`)
961961
- Bug in :class:`Categorical` constructor when constructing from a :class:`Categorical` object and ``dtype="category"`` losing ordered-ness (:issue:`49309`)
962+
- Bug in :meth:`SeriesGroupBy.min`, :meth:`SeriesGroupBy.max`, :meth:`DataFrameGroupBy.min`, and :meth:`DataFrameGroupBy.max` with unordered :class:`CategoricalDtype` and no groups failing to raise ``TypeError`` (:issue:`51034`)
962963
-
963964

964965
Datetimelike

pandas/core/groupby/ops.py

+8-7
Original file line numberDiff line numberDiff line change
@@ -226,9 +226,10 @@ def _disallow_invalid_ops(self, dtype: DtypeObj, is_numeric: bool = False):
226226
227227
Raises
228228
------
229+
TypeError
230+
This is not a valid operation for this dtype.
229231
NotImplementedError
230-
This is either not a valid function for this dtype, or
231-
valid but not implemented in cython.
232+
This may be a valid operation, but does not have a cython implementation.
232233
"""
233234
how = self.how
234235

@@ -237,16 +238,16 @@ def _disallow_invalid_ops(self, dtype: DtypeObj, is_numeric: bool = False):
237238
return
238239

239240
if isinstance(dtype, CategoricalDtype):
240-
# NotImplementedError for methods that can fall back to a
241-
# non-cython implementation.
242241
if how in ["sum", "prod", "cumsum", "cumprod"]:
243242
raise TypeError(f"{dtype} type does not support {how} operations")
243+
if how in ["min", "max", "rank"] and not dtype.ordered:
244+
# raise TypeError instead of NotImplementedError to ensure we
245+
# don't go down a group-by-group path, since in the empty-groups
246+
# case that would fail to raise
247+
raise TypeError(f"Cannot perform {how} with non-ordered Categorical")
244248
if how not in ["rank"]:
245249
# only "rank" is implemented in cython
246250
raise NotImplementedError(f"{dtype} dtype not supported")
247-
if not dtype.ordered:
248-
# TODO: TypeError?
249-
raise NotImplementedError(f"{dtype} dtype not supported")
250251

251252
elif is_sparse(dtype):
252253
raise NotImplementedError(f"{dtype} dtype not supported")

pandas/tests/groupby/test_function.py

+2
Original file line numberDiff line numberDiff line change
@@ -250,6 +250,7 @@ def _check(self, df, method, expected_columns, expected_columns_numeric):
250250
[
251251
"Categorical is not ordered",
252252
"function is not implemented for this dtype",
253+
f"Cannot perform {method} with non-ordered Categorical",
253254
]
254255
)
255256
with pytest.raises(exception, match=msg):
@@ -276,6 +277,7 @@ def _check(self, df, method, expected_columns, expected_columns_numeric):
276277
"category type does not support",
277278
"can't multiply sequence",
278279
"function is not implemented for this dtype",
280+
f"Cannot perform {method} with non-ordered Categorical",
279281
]
280282
)
281283
with pytest.raises(exception, match=msg):

pandas/tests/groupby/test_groupby.py

+36-45
Original file line numberDiff line numberDiff line change
@@ -1901,25 +1901,12 @@ def test_empty_groupby(
19011901
raises=ValueError, match="attempt to get arg(min|max) of an empty sequence"
19021902
)
19031903
request.node.add_marker(mark)
1904-
elif (
1905-
isinstance(values, Categorical)
1906-
and len(keys) == 1
1907-
and not isinstance(columns, list)
1908-
):
1909-
mark = pytest.mark.xfail(
1910-
raises=TypeError, match="'Categorical' does not implement"
1911-
)
1912-
request.node.add_marker(mark)
19131904
elif isinstance(values, Categorical) and len(keys) == 1 and op in ["sum", "prod"]:
19141905
mark = pytest.mark.xfail(
19151906
raises=AssertionError, match="(DataFrame|Series) are different"
19161907
)
19171908
request.node.add_marker(mark)
1918-
elif (
1919-
isinstance(values, Categorical)
1920-
and len(keys) == 2
1921-
and op in ["min", "max", "sum"]
1922-
):
1909+
elif isinstance(values, Categorical) and len(keys) == 2 and op in ["sum"]:
19231910
mark = pytest.mark.xfail(
19241911
raises=AssertionError, match="(DataFrame|Series) are different"
19251912
)
@@ -1949,6 +1936,31 @@ def get_result(**kwargs):
19491936
else:
19501937
return getattr(gb, method)(op, **kwargs)
19511938

1939+
if isinstance(values, Categorical) and not values.ordered and op in ["min", "max"]:
1940+
msg = f"Cannot perform {op} with non-ordered Categorical"
1941+
with pytest.raises(TypeError, match=msg):
1942+
get_result()
1943+
1944+
if isinstance(columns, list):
1945+
# i.e. DataFrameGroupBy, not SeriesGroupBy
1946+
result = get_result(numeric_only=True)
1947+
1948+
# Categorical is special: without 'observed=True' we get a NaN entry
1949+
# corresponding to the unobserved group. If we passed observed=True
1950+
# to groupby, expected would just be 'df.set_index(keys)[columns]'
1951+
# as below
1952+
lev = Categorical([0], dtype=values.dtype)
1953+
if len(keys) != 1:
1954+
idx = MultiIndex.from_product([lev, lev], names=keys)
1955+
else:
1956+
# all columns are dropped, but we end up with one row
1957+
# Categorical is special without 'observed=True'
1958+
idx = Index(lev, name=keys[0])
1959+
1960+
expected = DataFrame([], columns=[], index=idx)
1961+
tm.assert_equal(result, expected)
1962+
return
1963+
19521964
if columns == "C":
19531965
# i.e. SeriesGroupBy
19541966
if op in ["prod", "sum", "skew"]:
@@ -2018,38 +2030,17 @@ def get_result(**kwargs):
20182030
tm.assert_equal(result, expected)
20192031
return
20202032

2021-
if (op in ["min", "max", "skew"] and isinstance(values, Categorical)) or (
2022-
op == "skew" and df.dtypes[0].kind == "M"
2033+
if op == "skew" and (
2034+
isinstance(values, Categorical) or df.dtypes[0].kind == "M"
20232035
):
2024-
if op == "skew" or len(keys) == 1:
2025-
msg = "|".join(
2026-
[
2027-
"Categorical is not ordered",
2028-
"does not support reduction",
2029-
]
2030-
)
2031-
with pytest.raises(TypeError, match=msg):
2032-
get_result()
2033-
return
2034-
# Categorical doesn't implement, so with numeric_only=True
2035-
# these are dropped and we get an empty DataFrame back
2036-
result = get_result()
2037-
2038-
# with numeric_only=True, these are dropped, and we get
2039-
# an empty DataFrame back
2040-
if len(keys) != 1:
2041-
# Categorical is special without 'observed=True'
2042-
lev = Categorical([0], dtype=values.dtype)
2043-
mi = MultiIndex.from_product([lev, lev], names=keys)
2044-
expected = DataFrame([], columns=[], index=mi)
2045-
else:
2046-
# all columns are dropped, but we end up with one row
2047-
# Categorical is special without 'observed=True'
2048-
lev = Categorical([0], dtype=values.dtype)
2049-
ci = Index(lev, name=keys[0])
2050-
expected = DataFrame([], columns=[], index=ci)
2051-
2052-
tm.assert_equal(result, expected)
2036+
msg = "|".join(
2037+
[
2038+
"Categorical is not ordered",
2039+
"does not support reduction",
2040+
]
2041+
)
2042+
with pytest.raises(TypeError, match=msg):
2043+
get_result()
20532044
return
20542045

20552046
result = get_result()

pandas/tests/groupby/test_rank.py

+17
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,23 @@
1313
import pandas._testing as tm
1414

1515

16+
def test_rank_unordered_categorical_typeerror():
17+
# GH#51034 should be TypeError, not NotImplementedError
18+
cat = pd.Categorical([], ordered=False)
19+
ser = Series(cat)
20+
df = ser.to_frame()
21+
22+
msg = "Cannot perform rank with non-ordered Categorical"
23+
24+
gb = ser.groupby(cat)
25+
with pytest.raises(TypeError, match=msg):
26+
gb.rank()
27+
28+
gb2 = df.groupby(cat)
29+
with pytest.raises(TypeError, match=msg):
30+
gb2.rank()
31+
32+
1633
def test_rank_apply():
1734
lev1 = tm.rands_array(10, 100)
1835
lev2 = tm.rands_array(10, 130)

0 commit comments

Comments
 (0)