Skip to content

Commit eea9e75

Browse files
authored
BUG: groupby with CategoricalIndex doesn't include unobserved categories (#49373)
* BUG: groupby with CategoricalIndex doesn't include unobserved categories
* Test fixup
* cleanup
* Remove TODO
* Add GH reference
* Add GH reference
1 parent 4bea299 commit eea9e75

File tree

5 files changed

+93
-28
lines changed

5 files changed

+93
-28
lines changed

doc/source/whatsnew/v2.0.0.rst

+2-1
Original file line numberDiff line numberDiff line change
@@ -585,7 +585,8 @@ Groupby/resample/rolling
585585
- Bug in :meth:`DataFrameGroupBy.sample` raises ``ValueError`` when the object is empty (:issue:`48459`)
586586
- Bug in :meth:`Series.groupby` raises ``ValueError`` when an entry of the index is equal to the name of the index (:issue:`48567`)
587587
- Bug in :meth:`DataFrameGroupBy.resample` produces inconsistent results when passing empty DataFrame (:issue:`47705`)
588-
-
588+
- Bug in :class:`.DataFrameGroupBy` and :class:`.SeriesGroupBy` where unobserved categories were not included in the result when grouping by categorical indexes (:issue:`49354`)
589+
- Bug in :class:`.DataFrameGroupBy` and :class:`.SeriesGroupBy` where the order of the result depended on the input index when grouping by categoricals (:issue:`49223`)
589590

590591
Reshaping
591592
^^^^^^^^^

pandas/core/groupby/categorical.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,7 @@ def recode_for_groupby(
5353
unique_codes = unique1d(c.codes)
5454

5555
take_codes = unique_codes[unique_codes != -1]
56-
if c.ordered:
56+
if c.ordered or sort:
5757
take_codes = np.sort(take_codes)
5858

5959
# we recode according to the uniques

pandas/core/groupby/grouper.py

+13-10
Original file line numberDiff line numberDiff line change
@@ -476,11 +476,15 @@ def __init__(
476476
# In extant tests, the new self.grouping_vector matches
477477
# `index.get_level_values(ilevel)` whenever
478478
# mapper is None and isinstance(index, MultiIndex)
479+
if isinstance(index, MultiIndex):
480+
index_level = index.get_level_values(ilevel)
481+
else:
482+
index_level = index
479483
(
480484
self.grouping_vector, # Index
481485
self._codes,
482486
self._group_index,
483-
) = index._get_grouper_for_level(mapper, level=ilevel, dropna=dropna)
487+
) = index_level._get_grouper_for_level(mapper, dropna=dropna)
484488

485489
# a passed Grouper like, directly get the grouper in the same way
486490
# as single grouper groupby, use the group_info to get codes
@@ -504,15 +508,6 @@ def __init__(
504508
# use Index instead of ndarray so we can recover the name
505509
self.grouping_vector = Index(ng, name=newgrouper.result_index.name)
506510

507-
elif is_categorical_dtype(self.grouping_vector):
508-
# a passed Categorical
509-
self._passed_categorical = True
510-
511-
self._orig_cats = self.grouping_vector.categories
512-
self.grouping_vector, self._all_grouper = recode_for_groupby(
513-
self.grouping_vector, sort, observed
514-
)
515-
516511
elif not isinstance(
517512
self.grouping_vector, (Series, Index, ExtensionArray, np.ndarray)
518513
):
@@ -542,6 +537,14 @@ def __init__(
542537
# TODO 2022-10-08 we only have one test that gets here and
543538
# values are already in nanoseconds in that case.
544539
self.grouping_vector = Series(self.grouping_vector).to_numpy()
540+
elif is_categorical_dtype(self.grouping_vector):
541+
# a passed Categorical
542+
self._passed_categorical = True
543+
544+
self._orig_cats = self.grouping_vector.categories
545+
self.grouping_vector, self._all_grouper = recode_for_groupby(
546+
self.grouping_vector, sort, observed
547+
)
545548

546549
def __repr__(self) -> str:
547550
return f"Grouping({self.name})"

pandas/tests/groupby/test_apply.py

+8-2
Original file line numberDiff line numberDiff line change
@@ -870,14 +870,20 @@ def test_apply_multi_level_name(category):
870870
b = [1, 2] * 5
871871
if category:
872872
b = pd.Categorical(b, categories=[1, 2, 3])
873-
expected_index = pd.CategoricalIndex([1, 2], categories=[1, 2, 3], name="B")
873+
expected_index = pd.CategoricalIndex([1, 2, 3], categories=[1, 2, 3], name="B")
874+
# GH#40669 - summing an empty frame gives float dtype
875+
expected_values = [20.0, 25.0, 0.0]
874876
else:
875877
expected_index = Index([1, 2], name="B")
878+
expected_values = [20, 25]
879+
expected = DataFrame(
880+
{"C": expected_values, "D": expected_values}, index=expected_index
881+
)
882+
876883
df = DataFrame(
877884
{"A": np.arange(10), "B": b, "C": list(range(10)), "D": list(range(10))}
878885
).set_index(["A", "B"])
879886
result = df.groupby("B").apply(lambda x: x.sum())
880-
expected = DataFrame({"C": [20, 25], "D": [20, 25]}, index=expected_index)
881887
tm.assert_frame_equal(result, expected)
882888
assert df.index.names == ["A", "B"]
883889

pandas/tests/groupby/test_categorical.py

+69-14
Original file line numberDiff line numberDiff line change
@@ -487,6 +487,60 @@ def test_observed_groups(observed):
487487
tm.assert_dict_equal(result, expected)
488488

489489

490+
@pytest.mark.parametrize(
491+
"keys, expected_values, expected_index_levels",
492+
[
493+
("a", [15, 9, 0], CategoricalIndex([1, 2, 3], name="a")),
494+
(
495+
["a", "b"],
496+
[7, 8, 0, 0, 0, 9, 0, 0, 0],
497+
[CategoricalIndex([1, 2, 3], name="a"), Index([4, 5, 6])],
498+
),
499+
(
500+
["a", "a2"],
501+
[15, 0, 0, 0, 9, 0, 0, 0, 0],
502+
[
503+
CategoricalIndex([1, 2, 3], name="a"),
504+
CategoricalIndex([1, 2, 3], name="a"),
505+
],
506+
),
507+
],
508+
)
509+
@pytest.mark.parametrize("test_series", [True, False])
510+
def test_unobserved_in_index(keys, expected_values, expected_index_levels, test_series):
511+
# GH#49354 - ensure unobserved cats occur when grouping by index levels
512+
df = DataFrame(
513+
{
514+
"a": Categorical([1, 1, 2], categories=[1, 2, 3]),
515+
"a2": Categorical([1, 1, 2], categories=[1, 2, 3]),
516+
"b": [4, 5, 6],
517+
"c": [7, 8, 9],
518+
}
519+
).set_index(["a", "a2"])
520+
if "b" not in keys:
521+
# Only keep b when it is used for grouping for consistent columns in the result
522+
df = df.drop(columns="b")
523+
524+
gb = df.groupby(keys, observed=False)
525+
if test_series:
526+
gb = gb["c"]
527+
result = gb.sum()
528+
529+
if len(keys) == 1:
530+
index = expected_index_levels
531+
else:
532+
codes = [[0, 0, 0, 1, 1, 1, 2, 2, 2], 3 * [0, 1, 2]]
533+
index = MultiIndex(
534+
expected_index_levels,
535+
codes=codes,
536+
names=keys,
537+
)
538+
expected = DataFrame({"c": expected_values}, index=index)
539+
if test_series:
540+
expected = expected["c"]
541+
tm.assert_equal(result, expected)
542+
543+
490544
def test_observed_groups_with_nan(observed):
491545
# GH 24740
492546
df = DataFrame(
@@ -1234,11 +1288,12 @@ def df_cat(df):
12341288

12351289
@pytest.mark.parametrize("operation", ["agg", "apply"])
12361290
def test_seriesgroupby_observed_true(df_cat, operation):
1237-
# GH 24880
1238-
lev_a = Index(["foo", "foo", "bar", "bar"], dtype=df_cat["A"].dtype, name="A")
1239-
lev_b = Index(["one", "two", "one", "three"], dtype=df_cat["B"].dtype, name="B")
1291+
# GH#24880
1292+
# GH#49223 - order of results was wrong when grouping by index levels
1293+
lev_a = Index(["bar", "bar", "foo", "foo"], dtype=df_cat["A"].dtype, name="A")
1294+
lev_b = Index(["one", "three", "one", "two"], dtype=df_cat["B"].dtype, name="B")
12401295
index = MultiIndex.from_arrays([lev_a, lev_b])
1241-
expected = Series(data=[1, 3, 2, 4], index=index, name="C")
1296+
expected = Series(data=[2, 4, 1, 3], index=index, name="C")
12421297

12431298
grouped = df_cat.groupby(["A", "B"], observed=True)["C"]
12441299
result = getattr(grouped, operation)(sum)
@@ -1249,6 +1304,7 @@ def test_seriesgroupby_observed_true(df_cat, operation):
12491304
@pytest.mark.parametrize("observed", [False, None])
12501305
def test_seriesgroupby_observed_false_or_none(df_cat, observed, operation):
12511306
# GH 24880
1307+
# GH#49223 - order of results was wrong when grouping by index levels
12521308
index, _ = MultiIndex.from_product(
12531309
[
12541310
CategoricalIndex(["bar", "foo"], ordered=False),
@@ -1272,16 +1328,16 @@ def test_seriesgroupby_observed_false_or_none(df_cat, observed, operation):
12721328
True,
12731329
MultiIndex.from_arrays(
12741330
[
1275-
Index(["foo"] * 4 + ["bar"] * 4, dtype="category", name="A"),
1331+
Index(["bar"] * 4 + ["foo"] * 4, dtype="category", name="A"),
12761332
Index(
1277-
["one", "one", "two", "two", "one", "one", "three", "three"],
1333+
["one", "one", "three", "three", "one", "one", "two", "two"],
12781334
dtype="category",
12791335
name="B",
12801336
),
12811337
Index(["min", "max"] * 4),
12821338
]
12831339
),
1284-
[1, 1, 3, 3, 2, 2, 4, 4],
1340+
[2, 2, 4, 4, 1, 1, 3, 3],
12851341
),
12861342
(
12871343
False,
@@ -1857,7 +1913,7 @@ def test_category_order_reducer(
18571913
if (
18581914
reduction_func in ("idxmax", "idxmin")
18591915
and not observed
1860-
and index_kind == "range"
1916+
and index_kind != "multi"
18611917
):
18621918
msg = "GH#10694 - idxmax/min fail with unused categories"
18631919
request.node.add_marker(pytest.mark.xfail(reason=msg))
@@ -2005,10 +2061,13 @@ def test_category_order_apply(as_index, sort, observed, method, index_kind, orde
20052061

20062062

20072063
@pytest.mark.parametrize("index_kind", ["range", "single", "multi"])
2008-
def test_many_categories(as_index, sort, index_kind, ordered):
2064+
def test_many_categories(request, as_index, sort, index_kind, ordered):
20092065
# GH#48749 - Test when the grouper has many categories
20102066
if index_kind != "range" and not as_index:
20112067
pytest.skip(reason="Result doesn't have categories, nothing to test")
2068+
if index_kind == "multi" and as_index and not sort and ordered:
2069+
msg = "GH#48749 - values are unsorted even though the Categorical is ordered"
2070+
request.node.add_marker(pytest.mark.xfail(reason=msg))
20122071
categories = np.arange(9999, -1, -1)
20132072
grouper = Categorical([2, 1, 2, 3], categories=categories, ordered=ordered)
20142073
df = DataFrame({"a": grouper, "b": range(4)})
@@ -2025,11 +2084,7 @@ def test_many_categories(as_index, sort, index_kind, ordered):
20252084
result = gb.sum()
20262085

20272086
# Test is setup so that data and index are the same values
2028-
# TODO: GH#49223 - Order of values should be the same for all index_kinds
2029-
if index_kind == "range":
2030-
data = [3, 2, 1] if ordered else [2, 1, 3]
2031-
else:
2032-
data = [3, 2, 1] if sort else [2, 1, 3]
2087+
data = [3, 2, 1] if sort or ordered else [2, 1, 3]
20332088

20342089
index = CategoricalIndex(
20352090
data, categories=grouper.categories, ordered=ordered, name="a"

0 commit comments

Comments
 (0)