Skip to content

BUG: groupby with CategoricalIndex doesn't include unobserved categories #49373

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 13 commits into from
Nov 7, 2022
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion doc/source/whatsnew/v2.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -399,7 +399,8 @@ Groupby/resample/rolling
- Bug in :meth:`DataFrameGroupBy.sample` raises ``ValueError`` when the object is empty (:issue:`48459`)
- Bug in :meth:`Series.groupby` raises ``ValueError`` when an entry of the index is equal to the name of the index (:issue:`48567`)
- Bug in :meth:`DataFrameGroupBy.resample` produces inconsistent results when passing empty DataFrame (:issue:`47705`)
-
- Bug in :class:`.DataFrameGroupBy` and :class:`.SeriesGroupBy` where unobserved categories were not included in the result when grouping by categorical indexes (:issue:`49354`)
- Bug in :class:`.DataFrameGroupBy` and :class:`.SeriesGroupBy` where the result order would change depending on the input index when grouping by categoricals (:issue:`49223`)

Reshaping
^^^^^^^^^
Expand Down
2 changes: 1 addition & 1 deletion pandas/core/groupby/categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ def recode_for_groupby(
unique_codes = unique1d(c.codes)

take_codes = unique_codes[unique_codes != -1]
if c.ordered:
if c.ordered or sort:
take_codes = np.sort(take_codes)

# we recode according to the uniques
Expand Down
24 changes: 14 additions & 10 deletions pandas/core/groupby/grouper.py
Original file line number Diff line number Diff line change
Expand Up @@ -496,11 +496,16 @@ def __init__(
# In extant tests, the new self.grouping_vector matches
# `index.get_level_values(ilevel)` whenever
# mapper is None and isinstance(index, MultiIndex)
# TODO: Can you have two levels with the same name?
if isinstance(index, MultiIndex):
index_level = index.get_level_values(ilevel)
else:
index_level = index
(
self.grouping_vector, # Index
self._codes,
self._group_index,
) = index._get_grouper_for_level(mapper, level=ilevel, dropna=dropna)
) = index_level._get_grouper_for_level(mapper, dropna=dropna)

# a passed Grouper like, directly get the grouper in the same way
# as single grouper groupby, use the group_info to get codes
Expand All @@ -524,15 +529,6 @@ def __init__(
# use Index instead of ndarray so we can recover the name
self.grouping_vector = Index(ng, name=newgrouper.result_index.name)

elif is_categorical_dtype(self.grouping_vector):
# a passed Categorical
self._passed_categorical = True

self._orig_cats = self.grouping_vector.categories
self.grouping_vector, self._all_grouper = recode_for_groupby(
self.grouping_vector, sort, observed
)

elif not isinstance(
self.grouping_vector, (Series, Index, ExtensionArray, np.ndarray)
):
Expand Down Expand Up @@ -562,6 +558,14 @@ def __init__(
# TODO 2022-10-08 we only have one test that gets here and
# values are already in nanoseconds in that case.
self.grouping_vector = Series(self.grouping_vector).to_numpy()
elif is_categorical_dtype(self.grouping_vector):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Any particular reason this was moved from above?

Copy link
Member Author

@rhshadrach rhshadrach Nov 1, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Previously, this block was in an if...elif...elif... chain where it would be skipped over when the first if was true (namely, when the grouping is specified by a level in the index). Now it's moved out of that chain, so it's always hit when appropriate.

# a passed Categorical
self._passed_categorical = True

self._orig_cats = self.grouping_vector.categories
self.grouping_vector, self._all_grouper = recode_for_groupby(
self.grouping_vector, sort, observed
)

def __repr__(self) -> str:
# Compact debugging representation: only the grouping's name is shown.
return f"Grouping({self.name})"
Expand Down
10 changes: 8 additions & 2 deletions pandas/tests/groupby/test_apply.py
Original file line number Diff line number Diff line change
Expand Up @@ -871,14 +871,20 @@ def test_apply_multi_level_name(category):
b = [1, 2] * 5
if category:
b = pd.Categorical(b, categories=[1, 2, 3])
expected_index = pd.CategoricalIndex([1, 2], categories=[1, 2, 3], name="B")
expected_index = pd.CategoricalIndex([1, 2, 3], categories=[1, 2, 3], name="B")
# GH#40669 - summing an empty frame gives float dtype
expected_values = [20.0, 25.0, 0.0]
else:
expected_index = Index([1, 2], name="B")
expected_values = [20, 25]
expected = DataFrame(
{"C": expected_values, "D": expected_values}, index=expected_index
)

df = DataFrame(
{"A": np.arange(10), "B": b, "C": list(range(10)), "D": list(range(10))}
).set_index(["A", "B"])
result = df.groupby("B").apply(lambda x: x.sum())
expected = DataFrame({"C": [20, 25], "D": [20, 25]}, index=expected_index)
tm.assert_frame_equal(result, expected)
assert df.index.names == ["A", "B"]

Expand Down
79 changes: 66 additions & 13 deletions pandas/tests/groupby/test_categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -487,6 +487,60 @@ def test_observed_groups(observed):
tm.assert_dict_equal(result, expected)


# Each case supplies: the grouping key(s), the expected sums of column "c",
# and the expected index -- a single index when grouping by one key, otherwise
# the list of levels used to build a MultiIndex.
@pytest.mark.parametrize(
"keys, expected_values, expected_index_levels",
[
("a", [15, 9, 0], CategoricalIndex([1, 2, 3], name="a")),
(
["a", "b"],
[7, 8, 0, 0, 0, 9, 0, 0, 0],
[CategoricalIndex([1, 2, 3], name="a"), Index([4, 5, 6])],
),
(
["a", "a2"],
[15, 0, 0, 0, 9, 0, 0, 0, 0],
[
CategoricalIndex([1, 2, 3], name="a"),
CategoricalIndex([1, 2, 3], name="a"),
],
),
],
)
# Run every case against both the DataFrame result and the "c" Series result.
@pytest.mark.parametrize("test_series", [True, False])
def test_unobserved_in_index(keys, expected_values, expected_index_levels, test_series):
# GH#49354 - ensure unobserved cats occur when grouping by index levels
# "a" and "a2" declare categories [1, 2, 3] but only contain 1 and 2, so
# category 3 is unobserved; both columns are then moved into the index.
df = DataFrame(
{
"a": Categorical([1, 1, 2], categories=[1, 2, 3]),
"a2": Categorical([1, 1, 2], categories=[1, 2, 3]),
"b": [4, 5, 6],
"c": [7, 8, 9],
}
).set_index(["a", "a2"])
if "b" not in keys:
# Only keep b when it is used for grouping for consistent columns in the result
df = df.drop(columns="b")

# observed=False: the expected values below include zero-sum rows for the
# category combinations that never occur in the data.
gb = df.groupby(keys, observed=False)
if test_series:
gb = gb["c"]
result = gb.sum()

# Only the string key "a" has length 1; the list-valued keys have length 2.
if len(keys) == 1:
index = expected_index_levels
else:
# Codes enumerate the full 3 x 3 product of the two levels, in order.
codes = [[0, 0, 0, 1, 1, 1, 2, 2, 2], 3 * [0, 1, 2]]
index = MultiIndex(
expected_index_levels,
codes=codes,
names=keys,
)
expected = DataFrame({"c": expected_values}, index=index)
if test_series:
# Mirror the column selection applied to the groupby object above.
expected = expected["c"]
tm.assert_equal(result, expected)


def test_observed_groups_with_nan(observed):
# GH 24740
df = DataFrame(
Expand Down Expand Up @@ -1235,10 +1289,10 @@ def df_cat(df):
@pytest.mark.parametrize("operation", ["agg", "apply"])
def test_seriesgroupby_observed_true(df_cat, operation):
# GH 24880
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nit: Could you add the GH reference related to why this test changed?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done.

lev_a = Index(["foo", "foo", "bar", "bar"], dtype=df_cat["A"].dtype, name="A")
lev_b = Index(["one", "two", "one", "three"], dtype=df_cat["B"].dtype, name="B")
lev_a = Index(["bar", "bar", "foo", "foo"], dtype=df_cat["A"].dtype, name="A")
lev_b = Index(["one", "three", "one", "two"], dtype=df_cat["B"].dtype, name="B")
index = MultiIndex.from_arrays([lev_a, lev_b])
expected = Series(data=[1, 3, 2, 4], index=index, name="C")
expected = Series(data=[2, 4, 1, 3], index=index, name="C")

grouped = df_cat.groupby(["A", "B"], observed=True)["C"]
result = getattr(grouped, operation)(sum)
Expand Down Expand Up @@ -1272,16 +1326,16 @@ def test_seriesgroupby_observed_false_or_none(df_cat, observed, operation):
True,
MultiIndex.from_arrays(
[
Index(["foo"] * 4 + ["bar"] * 4, dtype="category", name="A"),
Index(["bar"] * 4 + ["foo"] * 4, dtype="category", name="A"),
Index(
["one", "one", "two", "two", "one", "one", "three", "three"],
["one", "one", "three", "three", "one", "one", "two", "two"],
dtype="category",
name="B",
),
Index(["min", "max"] * 4),
]
),
[1, 1, 3, 3, 2, 2, 4, 4],
[2, 2, 4, 4, 1, 1, 3, 3],
),
(
False,
Expand Down Expand Up @@ -1857,7 +1911,7 @@ def test_category_order_reducer(
if (
reduction_func in ("idxmax", "idxmin")
and not observed
and index_kind == "range"
and index_kind != "multi"
):
msg = "GH#10694 - idxmax/min fail with unused categories"
request.node.add_marker(pytest.mark.xfail(reason=msg))
Expand Down Expand Up @@ -2005,10 +2059,13 @@ def test_category_order_apply(as_index, sort, observed, method, index_kind, orde


@pytest.mark.parametrize("index_kind", ["range", "single", "multi"])
def test_many_categories(as_index, sort, index_kind, ordered):
def test_many_categories(request, as_index, sort, index_kind, ordered):
# GH#48749 - Test when the grouper has many categories
if index_kind != "range" and not as_index:
pytest.skip(reason="Result doesn't have categories, nothing to test")
if index_kind == "multi" and as_index and not sort and ordered:
msg = "GH#48749 - values are unsorted even though the Categorical is ordered"
request.node.add_marker(pytest.mark.xfail(reason=msg))
categories = np.arange(9999, -1, -1)
grouper = Categorical([2, 1, 2, 3], categories=categories, ordered=ordered)
df = DataFrame({"a": grouper, "b": range(4)})
Expand All @@ -2025,11 +2082,7 @@ def test_many_categories(as_index, sort, index_kind, ordered):
result = gb.sum()

# Test is set up so that data and index are the same values
# TODO: GH#49223 - Order of values should be the same for all index_kinds
if index_kind == "range":
data = [3, 2, 1] if ordered else [2, 1, 3]
else:
data = [3, 2, 1] if sort else [2, 1, 3]
data = [3, 2, 1] if sort or ordered else [2, 1, 3]

index = CategoricalIndex(
data, categories=grouper.categories, ordered=ordered, name="a"
Expand Down