Skip to content

Commit e754d18

Browse files
authored
TST: fix test_empty_groupby xfails (#51075)
* BUG: GroupBy.min/max with unordered Categorical and no groups
* GH ref
* update test
* test for non-ordered Categorical rank
* test period case, fix most xfails
* simplify
1 parent ebccd37 commit e754d18

File tree

1 file changed

+49
-102
lines changed

1 file changed

+49
-102
lines changed

pandas/tests/groupby/test_groupby.py

+49 -102
Original file line number | Diff line number | Diff line change
@@ -1858,6 +1858,7 @@ def test_pivot_table_values_key_error():
18581858
Categorical([0]),
18591859
[to_datetime(0)],
18601860
date_range(0, 1, 1, tz="US/Eastern"),
1861+
pd.period_range("2016-01-01", periods=3, freq="D"),
18611862
pd.array([0], dtype="Int64"),
18621863
pd.array([0], dtype="Float64"),
18631864
pd.array([False], dtype="boolean"),
@@ -1870,6 +1871,7 @@ def test_pivot_table_values_key_error():
18701871
"cat",
18711872
"dt64",
18721873
"dt64tz",
1874+
"period",
18731875
"Int64",
18741876
"Float64",
18751877
"boolean",
@@ -1886,13 +1888,6 @@ def test_empty_groupby(
18861888
override_dtype = None
18871889

18881890
if (
1889-
isinstance(values, Categorical)
1890-
and not isinstance(columns, list)
1891-
and op in ["sum", "prod", "skew"]
1892-
):
1893-
# handled below GH#41291
1894-
pass
1895-
elif (
18961891
isinstance(values, Categorical)
18971892
and len(keys) == 1
18981893
and op in ["idxmax", "idxmin"]
@@ -1901,18 +1896,8 @@ def test_empty_groupby(
19011896
raises=ValueError, match="attempt to get arg(min|max) of an empty sequence"
19021897
)
19031898
request.node.add_marker(mark)
1904-
elif isinstance(values, Categorical) and len(keys) == 1 and op in ["sum", "prod"]:
1905-
mark = pytest.mark.xfail(
1906-
raises=AssertionError, match="(DataFrame|Series) are different"
1907-
)
1908-
request.node.add_marker(mark)
1909-
elif isinstance(values, Categorical) and len(keys) == 2 and op in ["sum"]:
1910-
mark = pytest.mark.xfail(
1911-
raises=AssertionError, match="(DataFrame|Series) are different"
1912-
)
1913-
request.node.add_marker(mark)
19141899

1915-
elif isinstance(values, BooleanArray) and op in ["sum", "prod"]:
1900+
if isinstance(values, BooleanArray) and op in ["sum", "prod"]:
19161901
# We expect to get Int64 back for these
19171902
override_dtype = "Int64"
19181903

@@ -1936,6 +1921,26 @@ def get_result(**kwargs):
19361921
else:
19371922
return getattr(gb, method)(op, **kwargs)
19381923

1924+
def get_categorical_invalid_expected():
1925+
# Categorical is special without 'observed=True', we get an NaN entry
1926+
# corresponding to the unobserved group. If we passed observed=True
1927+
# to groupby, expected would just be 'df.set_index(keys)[columns]'
1928+
# as below
1929+
lev = Categorical([0], dtype=values.dtype)
1930+
if len(keys) != 1:
1931+
idx = MultiIndex.from_product([lev, lev], names=keys)
1932+
else:
1933+
# all columns are dropped, but we end up with one row
1934+
# Categorical is special without 'observed=True'
1935+
idx = Index(lev, name=keys[0])
1936+
1937+
expected = DataFrame([], columns=[], index=idx)
1938+
return expected
1939+
1940+
is_per = isinstance(df.dtypes[0], pd.PeriodDtype)
1941+
is_dt64 = df.dtypes[0].kind == "M"
1942+
is_cat = isinstance(values, Categorical)
1943+
19391944
if isinstance(values, Categorical) and not values.ordered and op in ["min", "max"]:
19401945
msg = f"Cannot perform {op} with non-ordered Categorical"
19411946
with pytest.raises(TypeError, match=msg):
@@ -1944,105 +1949,47 @@ def get_result(**kwargs):
19441949
if isinstance(columns, list):
19451950
# i.e. DataframeGroupBy, not SeriesGroupBy
19461951
result = get_result(numeric_only=True)
1947-
1948-
# Categorical is special without 'observed=True', we get an NaN entry
1949-
# corresponding to the unobserved group. If we passed observed=True
1950-
# to groupby, expected would just be 'df.set_index(keys)[columns]'
1951-
# as below
1952-
lev = Categorical([0], dtype=values.dtype)
1953-
if len(keys) != 1:
1954-
idx = MultiIndex.from_product([lev, lev], names=keys)
1955-
else:
1956-
# all columns are dropped, but we end up with one row
1957-
# Categorical is special without 'observed=True'
1958-
idx = Index(lev, name=keys[0])
1959-
1960-
expected = DataFrame([], columns=[], index=idx)
1952+
expected = get_categorical_invalid_expected()
19611953
tm.assert_equal(result, expected)
19621954
return
19631955

1964-
if columns == "C":
1965-
# i.e. SeriesGroupBy
1966-
if op in ["prod", "sum", "skew"]:
1967-
# ops that require more than just ordered-ness
1968-
if df.dtypes[0].kind == "M":
1969-
# GH#41291
1970-
# datetime64 -> prod and sum are invalid
1971-
if op == "skew":
1972-
msg = "does not support reduction 'skew'"
1973-
else:
1974-
msg = "datetime64 type does not support"
1975-
with pytest.raises(TypeError, match=msg):
1976-
get_result()
1977-
1978-
return
1979-
if op in ["prod", "sum", "skew"]:
1980-
if isinstance(values, Categorical):
1981-
# GH#41291
1982-
if op == "skew":
1983-
msg = f"does not support reduction '{op}'"
1984-
else:
1985-
msg = "category type does not support"
1986-
with pytest.raises(TypeError, match=msg):
1987-
get_result()
1956+
if op in ["prod", "sum", "skew"]:
1957+
# ops that require more than just ordered-ness
1958+
if is_dt64 or is_cat or is_per:
1959+
# GH#41291
1960+
# datetime64 -> prod and sum are invalid
1961+
if op == "skew":
1962+
msg = "does not support reduction 'skew'"
1963+
elif is_dt64:
1964+
msg = "datetime64 type does not support"
1965+
elif is_per:
1966+
msg = "Period type does not support"
1967+
else:
1968+
msg = "category type does not support"
1969+
with pytest.raises(TypeError, match=msg):
1970+
get_result()
19881971

1972+
if not isinstance(columns, list):
1973+
# i.e. SeriesGroupBy
19891974
return
1990-
else:
1991-
# ie. DataFrameGroupBy
1992-
if op in ["prod", "sum"]:
1993-
# ops that require more than just ordered-ness
1994-
if df.dtypes[0].kind == "M":
1995-
# GH#41291
1996-
# datetime64 -> prod and sum are invalid
1997-
with pytest.raises(TypeError, match="datetime64 type does not support"):
1998-
get_result()
1999-
result = get_result(numeric_only=True)
2000-
2001-
# with numeric_only=True, these are dropped, and we get
2002-
# an empty DataFrame back
2003-
expected = df.set_index(keys)[[]]
2004-
tm.assert_equal(result, expected)
1975+
elif op == "skew":
1976+
# TODO: test the numeric_only=True case
20051977
return
2006-
2007-
elif isinstance(values, Categorical):
1978+
else:
1979+
# i.e. op in ["prod", "sum"]:
1980+
# i.e. DataFrameGroupBy
1981+
# ops that require more than just ordered-ness
20081982
# GH#41291
2009-
# Categorical doesn't implement sum or prod
2010-
with pytest.raises(TypeError, match="category type does not support"):
2011-
get_result()
20121983
result = get_result(numeric_only=True)
20131984

20141985
# with numeric_only=True, these are dropped, and we get
20151986
# an empty DataFrame back
20161987
expected = df.set_index(keys)[[]]
2017-
if len(keys) != 1 and op == "prod":
2018-
# TODO: why just prod and not sum?
2019-
# Categorical is special without 'observed=True'
2020-
lev = Categorical([0], dtype=values.dtype)
2021-
mi = MultiIndex.from_product([lev, lev], names=["A", "B"])
2022-
expected = DataFrame([], columns=[], index=mi)
2023-
2024-
tm.assert_equal(result, expected)
2025-
return
2026-
2027-
elif df.dtypes[0] == object:
2028-
result = get_result()
2029-
expected = df.set_index(keys)[["C"]]
1988+
if is_cat:
1989+
expected = get_categorical_invalid_expected()
20301990
tm.assert_equal(result, expected)
20311991
return
20321992

2033-
if op == "skew" and (
2034-
isinstance(values, Categorical) or df.dtypes[0].kind == "M"
2035-
):
2036-
msg = "|".join(
2037-
[
2038-
"Categorical is not ordered",
2039-
"does not support reduction",
2040-
]
2041-
)
2042-
with pytest.raises(TypeError, match=msg):
2043-
get_result()
2044-
return
2045-
20461993
result = get_result()
20471994
expected = df.set_index(keys)[columns]
20481995
if override_dtype is not None:

0 commit comments

Comments
 (0)