Skip to content

Commit 2ed252a

Browse files
authored
TST: add test to ensure that df.groupby() returns the missing categories when grouping on 2 pd.Categoricals (#35022)
1 parent 16bbae0 commit 2ed252a

File tree

1 file changed

+142
-32
lines changed

1 file changed

+142
-32
lines changed

pandas/tests/groupby/test_categorical.py

+142-32
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,41 @@ def f(a):
3636
return result.reindex(index).sort_index()
3737

3838

39+
# This maps the builtin groupby functions to their expected outputs for
# missing categories when they are called on a categorical grouper with
# observed=False. Some functions are expected to return NaN, some zero.
# These expected values can be used across several tests (i.e. they are
# the same for SeriesGroupBy and DataFrameGroupBy) but they should only be
# hardcoded in one place.
#
# NaN is spelled ``np.nan`` (not the ``np.NaN`` alias, which NumPy 2.0
# removed) so that this module keeps importing on current NumPy releases.
_results_for_groupbys_with_missing_categories = {
    "all": np.nan,
    "any": np.nan,
    "count": 0,
    "corrwith": np.nan,
    "first": np.nan,
    "idxmax": np.nan,
    "idxmin": np.nan,
    "last": np.nan,
    "mad": np.nan,
    "max": np.nan,
    "mean": np.nan,
    "median": np.nan,
    "min": np.nan,
    "nth": np.nan,
    "nunique": 0,
    "prod": np.nan,
    "quantile": np.nan,
    "sem": np.nan,
    "size": 0,
    "skew": np.nan,
    "std": np.nan,
    "sum": 0,
    "var": np.nan,
}
72+
73+
3974
def test_apply_use_categorical_name(df):
4075
cats = qcut(df.C, 4)
4176

@@ -1263,12 +1298,13 @@ def test_series_groupby_on_2_categoricals_unobserved(
12631298
reduction_func: str, observed: bool, request
12641299
):
12651300
# GH 17605
1266-
12671301
if reduction_func == "ngroup":
12681302
pytest.skip("ngroup is not truly a reduction")
12691303

12701304
if reduction_func == "corrwith": # GH 32293
1271-
mark = pytest.mark.xfail(reason="TODO: implemented SeriesGroupBy.corrwith")
1305+
mark = pytest.mark.xfail(
1306+
reason="TODO: implemented SeriesGroupBy.corrwith. See GH 32293"
1307+
)
12721308
request.node.add_marker(mark)
12731309

12741310
df = pd.DataFrame(
@@ -1289,36 +1325,30 @@ def test_series_groupby_on_2_categoricals_unobserved(
12891325
assert len(result) == expected_length
12901326

12911327

1292-
@pytest.mark.parametrize(
1293-
"func, zero_or_nan",
1294-
[
1295-
("all", np.NaN),
1296-
("any", np.NaN),
1297-
("count", 0),
1298-
("first", np.NaN),
1299-
("idxmax", np.NaN),
1300-
("idxmin", np.NaN),
1301-
("last", np.NaN),
1302-
("mad", np.NaN),
1303-
("max", np.NaN),
1304-
("mean", np.NaN),
1305-
("median", np.NaN),
1306-
("min", np.NaN),
1307-
("nth", np.NaN),
1308-
("nunique", 0),
1309-
("prod", np.NaN),
1310-
("quantile", np.NaN),
1311-
("sem", np.NaN),
1312-
("size", 0),
1313-
("skew", np.NaN),
1314-
("std", np.NaN),
1315-
("sum", np.NaN),
1316-
("var", np.NaN),
1317-
],
1318-
)
1319-
def test_series_groupby_on_2_categoricals_unobserved_zeroes_or_nans(func, zero_or_nan):
1328+
def test_series_groupby_on_2_categoricals_unobserved_zeroes_or_nans(
1329+
reduction_func: str, request
1330+
):
13201331
# GH 17605
13211332
# Tests whether the unobserved categories in the result contain 0 or NaN
1333+
1334+
if reduction_func == "ngroup":
1335+
pytest.skip("ngroup is not truly a reduction")
1336+
1337+
if reduction_func == "corrwith": # GH 32293
1338+
mark = pytest.mark.xfail(
1339+
reason="TODO: implemented SeriesGroupBy.corrwith. See GH 32293"
1340+
)
1341+
request.node.add_marker(mark)
1342+
1343+
if reduction_func == "sum": # GH 31422
1344+
mark = pytest.mark.xfail(
1345+
reason=(
1346+
"sum should return 0 but currently returns NaN. "
1347+
"This is a known bug. See GH 31422."
1348+
)
1349+
)
1350+
request.node.add_marker(mark)
1351+
13221352
df = pd.DataFrame(
13231353
{
13241354
"cat_1": pd.Categorical(list("AABB"), categories=list("ABC")),
@@ -1327,12 +1357,14 @@ def test_series_groupby_on_2_categoricals_unobserved_zeroes_or_nans(func, zero_o
13271357
}
13281358
)
13291359
unobserved = [tuple("AC"), tuple("BC"), tuple("CA"), tuple("CB"), tuple("CC")]
1330-
args = {"nth": [0]}.get(func, [])
1360+
args = {"nth": [0]}.get(reduction_func, [])
13311361

13321362
series_groupby = df.groupby(["cat_1", "cat_2"], observed=False)["value"]
1333-
agg = getattr(series_groupby, func)
1363+
agg = getattr(series_groupby, reduction_func)
13341364
result = agg(*args)
13351365

1366+
zero_or_nan = _results_for_groupbys_with_missing_categories[reduction_func]
1367+
13361368
for idx in unobserved:
13371369
val = result.loc[idx]
13381370
assert (pd.isna(zero_or_nan) and pd.isna(val)) or (val == zero_or_nan)
@@ -1342,6 +1374,84 @@ def test_series_groupby_on_2_categoricals_unobserved_zeroes_or_nans(func, zero_o
13421374
assert np.issubdtype(result.dtype, np.integer)
13431375

13441376

1377+
def test_dataframe_groupby_on_2_categoricals_when_observed_is_true(reduction_func: str):
    """
    Check that grouping on two Categoricals with observed=True drops the
    category combinations that do not occur in the frame.

    ``reduction_func`` is the name of the groupby method to call; it is
    presumably supplied by a pytest fixture defined outside this view.
    """
    # GH 23865
    # GH 27075
    if reduction_func == "ngroup":
        pytest.skip("ngroup does not return the Categories on the index")

    frame = pd.DataFrame(
        {
            "cat_1": pd.Categorical(list("AABB"), categories=list("ABC")),
            "cat_2": pd.Categorical(list("1111"), categories=list("12")),
            "value": [0.1, 0.1, 0.1, 0.1],
        }
    )
    grouped = frame.groupby(["cat_1", "cat_2"], observed=True)

    # A couple of reductions need a positional argument to be callable.
    if reduction_func == "nth":
        extra_args = [0]
    elif reduction_func == "corrwith":
        extra_args = [frame]
    else:
        extra_args = []
    result = getattr(grouped, reduction_func)(*extra_args)

    # Category pairs that never appear together in ``frame``.
    absent_pairs = [("A", "2"), ("B", "2"), ("C", "1"), ("C", "2")]
    leaked = [pair for pair in absent_pairs if pair in result.index]
    assert not leaked
1401+
1402+
1403+
@pytest.mark.parametrize("observed", [False, None])
def test_dataframe_groupby_on_2_categoricals_when_observed_is_false(
    reduction_func: str, observed: bool, request
):
    """
    Check that grouping on two Categoricals with observed=False/None keeps
    the category combinations that do not occur in the frame, and that the
    rows for those combinations hold the expected fill value (0 or NaN).

    ``reduction_func`` is the name of the groupby method to call and
    ``request`` is used to attach xfail markers; both are presumably
    supplied by pytest fixtures defined outside this view.
    """
    # GH 23865
    # GH 27075

    if reduction_func == "ngroup":
        pytest.skip("ngroup does not return the Categories on the index")

    if reduction_func == "count":  # GH 35028
        mark = pytest.mark.xfail(
            reason=(
                "DataFrameGroupBy.count returns np.NaN for missing "
                "categories, when it should return 0. See GH 35028"
            )
        )
        request.node.add_marker(mark)

    if reduction_func == "sum":  # GH 31422
        mark = pytest.mark.xfail(
            reason=(
                "sum should return 0 but currently returns NaN. "
                "This is a known bug. See GH 31422."
            )
        )
        request.node.add_marker(mark)

    df = pd.DataFrame(
        {
            "cat_1": pd.Categorical(list("AABB"), categories=list("ABC")),
            "cat_2": pd.Categorical(list("1111"), categories=list("12")),
            "value": [0.1, 0.1, 0.1, 0.1],
        }
    )
    # Category pairs that never appear together in ``df``.
    unobserved_cats = [("A", "2"), ("B", "2"), ("C", "1"), ("C", "2")]

    df_grp = df.groupby(["cat_1", "cat_2"], observed=observed)

    # A couple of reductions need a positional argument to be callable.
    args = {"nth": [0], "corrwith": [df]}.get(reduction_func, [])
    res = getattr(df_grp, reduction_func)(*args)

    expected = _results_for_groupbys_with_missing_categories[reduction_func]
    unobserved_rows = res.loc[unobserved_cats]

    # Test for NaN by value rather than identity: ``expected is np.nan`` only
    # holds while the lookup table stores the exact ``np.nan`` singleton.
    if pd.isna(expected):
        assert unobserved_rows.isnull().all().all()
    else:
        assert (unobserved_rows == expected).all().all()
1453+
1454+
13451455
def test_series_groupby_categorical_aggregation_getitem():
13461456
# GH 8870
13471457
d = {"foo": [10, 8, 4, 1], "bar": [10, 20, 30, 40], "baz": ["d", "c", "d", "c"]}

0 commit comments

Comments
 (0)