Skip to content

BUG: groupby reorders categorical categories #49131

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 12 commits into from
Oct 24, 2022
2 changes: 1 addition & 1 deletion doc/source/whatsnew/v2.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -170,7 +170,7 @@ Bug fixes
Categorical
^^^^^^^^^^^
- Bug in :meth:`Categorical.set_categories` losing dtype information (:issue:`48812`)
-
- Bug in :meth:`DataFrame.groupby` and :meth:`Series.groupby` reordering the categories of a :class:`Categorical` when it is used as a grouper (:issue:`48749`)

Datetimelike
^^^^^^^^^^^^
Expand Down
34 changes: 0 additions & 34 deletions pandas/core/groupby/categorical.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,5 @@
from __future__ import annotations

from typing import TYPE_CHECKING

import numpy as np

from pandas.core.algorithms import unique1d
Expand All @@ -11,9 +9,6 @@
recode_for_categories,
)

if TYPE_CHECKING:
from pandas.core.indexes.api import CategoricalIndex


def recode_for_groupby(
c: Categorical, sort: bool, observed: bool
Expand Down Expand Up @@ -90,32 +85,3 @@ def recode_for_groupby(
take_codes = unique_notnan_codes

return Categorical(c, c.unique().categories.take(take_codes)), None


def recode_from_groupby(
    c: Categorical, sort: bool, ci: CategoricalIndex
) -> CategoricalIndex:
    """
    Restore the categories of ``c`` on a groupby result index.

    Parameters
    ----------
    c : Categorical
        Supplies the categories that the result should carry.
    sort : bool
        The value of the sort parameter groupby was called with.
    ci : CategoricalIndex
        The index whose categories are to be adjusted.

    Returns
    -------
    CategoricalIndex
        With ``sort=True``, ``ci`` with its categories set to ``c.categories``.
        Otherwise, ``ci`` with the categories of ``c`` that it does not
        already have appended after its existing ones.
    """
    original_categories = c.categories

    if not sort:
        # unsorted: keep ci's own category order and tack on whatever
        # categories of c are not present in ci yet
        already_present = original_categories.isin(ci.categories)
        missing = original_categories[~already_present]
        # error: "CategoricalIndex" has no attribute "add_categories"
        return ci.add_categories(missing)  # type: ignore[attr-defined]

    # sorted: go back to the category ordering of c
    # error: "CategoricalIndex" has no attribute "set_categories"
    return ci.set_categories(original_categories)  # type: ignore[attr-defined]
14 changes: 9 additions & 5 deletions pandas/core/groupby/grouper.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,10 +39,7 @@
import pandas.core.common as com
from pandas.core.frame import DataFrame
from pandas.core.groupby import ops
from pandas.core.groupby.categorical import (
recode_for_groupby,
recode_from_groupby,
)
from pandas.core.groupby.categorical import recode_for_groupby
from pandas.core.indexes.api import (
CategoricalIndex,
Index,
Expand Down Expand Up @@ -462,6 +459,7 @@ class Grouping:
_group_index: Index | None = None
_passed_categorical: bool
_all_grouper: Categorical | None
_orig_cats: Index | None
_index: Index

def __init__(
Expand All @@ -479,6 +477,7 @@ def __init__(
self._orig_grouper = grouper
self.grouping_vector = _convert_grouper(index, grouper)
self._all_grouper = None
self._orig_cats = None
self._index = index
self._sort = sort
self.obj = obj
Expand Down Expand Up @@ -529,6 +528,7 @@ def __init__(
# a passed Categorical
self._passed_categorical = True

self._orig_cats = self.grouping_vector.categories
self.grouping_vector, self._all_grouper = recode_for_groupby(
self.grouping_vector, sort, observed
)
Expand Down Expand Up @@ -646,7 +646,9 @@ def result_index(self) -> Index:
if self._all_grouper is not None:
group_idx = self.group_index
assert isinstance(group_idx, CategoricalIndex)
return recode_from_groupby(self._all_grouper, self._sort, group_idx)
categories = self._all_grouper.categories
# set_categories is dynamically added
return group_idx.set_categories(categories) # type: ignore[attr-defined]
return self.group_index

@cache_readonly
Expand Down Expand Up @@ -678,6 +680,8 @@ def _codes_and_uniques(self) -> tuple[npt.NDArray[np.signedinteger], ArrayLike]:
uniques = Categorical.from_codes(
codes=ucodes, categories=categories, ordered=cat.ordered
)
if not self._observed:
uniques = uniques.reorder_categories(self._orig_cats)
return cat.codes, uniques

elif isinstance(self.grouping_vector, ops.BaseGrouper):
Expand Down
193 changes: 175 additions & 18 deletions pandas/tests/groupby/test_categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -782,7 +782,7 @@ def test_preserve_categories():
# ordered=False
df = DataFrame({"A": Categorical(list("ba"), categories=categories, ordered=False)})
sort_index = CategoricalIndex(categories, categories, ordered=False, name="A")
nosort_index = CategoricalIndex(list("bac"), list("bac"), ordered=False, name="A")
nosort_index = CategoricalIndex(list("bac"), list("abc"), ordered=False, name="A")
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could you add a GH reference by these changed tests?

tm.assert_index_equal(
df.groupby("A", sort=True, observed=False).first().index, sort_index
)
Expand Down Expand Up @@ -965,7 +965,7 @@ def test_sort2():

index = CategoricalIndex(
["(7.5, 10]", "(2.5, 5]", "(5, 7.5]", "(0, 2.5]"],
categories=["(7.5, 10]", "(2.5, 5]", "(5, 7.5]", "(0, 2.5]"],
categories=["(0, 2.5]", "(2.5, 5]", "(5, 7.5]", "(7.5, 10]"],
name="range",
)
expected_nosort = DataFrame(
Expand Down Expand Up @@ -1042,27 +1042,34 @@ def test_sort_datetimelike():

# ordered = False
df["dt"] = Categorical(df["dt"], ordered=False)
index = [
datetime(2011, 1, 1),
datetime(2011, 2, 1),
datetime(2011, 5, 1),
datetime(2011, 7, 1),
]
sort_index = CategoricalIndex(
[
datetime(2011, 1, 1),
datetime(2011, 2, 1),
datetime(2011, 5, 1),
datetime(2011, 7, 1),
],
name="dt",
)
result_sort = DataFrame(
[[1, 60], [5, 30], [6, 40], [10, 10]], columns=["foo", "bar"]
[[1, 60], [5, 30], [6, 40], [10, 10]], columns=["foo", "bar"], index=sort_index
)
result_sort.index = CategoricalIndex(index, name="dt")

index = [
datetime(2011, 7, 1),
datetime(2011, 2, 1),
datetime(2011, 5, 1),
datetime(2011, 1, 1),
]
nosort_index = CategoricalIndex(
[
datetime(2011, 7, 1),
datetime(2011, 2, 1),
datetime(2011, 5, 1),
datetime(2011, 1, 1),
],
categories=sort_index.categories,
name="dt",
)
result_nosort = DataFrame(
[[10, 10], [5, 30], [6, 40], [1, 60]], columns=["foo", "bar"]
[[10, 10], [5, 30], [6, 40], [1, 60]],
columns=["foo", "bar"],
index=nosort_index,
)
result_nosort.index = CategoricalIndex(index, categories=index, name="dt")

col = "dt"
tm.assert_frame_equal(
Expand Down Expand Up @@ -1845,3 +1852,153 @@ def test_groupby_categorical_dropna(observed, dropna):
expected.index.name = "x"

tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("index_kind", ["range", "single", "multi"])
@pytest.mark.parametrize("ordered", [True, False])
def test_category_order_reducer(
    request, as_index, sort, observed, reduction_func, index_kind, ordered
):
    """Check that reductions keep the grouper's category order."""
    # GH#48749
    is_idx_reduction = reduction_func in ("idxmax", "idxmin")
    if is_idx_reduction and not observed and index_kind == "range":
        msg = "GH#10694 - idxmax/min fail with unused categories"
        request.node.add_marker(pytest.mark.xfail(reason=msg))
    elif index_kind != "range" and not as_index:
        pytest.skip(reason="Result doesn't have categories, nothing to test")

    cat_values = Categorical([2, 1, 2, 3], categories=[1, 4, 3, 2], ordered=ordered)
    df = DataFrame({"a": cat_values, "b": range(4)})

    # "range" groups by the column; the other kinds group by index level(s)
    keys = ["a", "a2"] if index_kind == "multi" else ["a"]
    if index_kind == "multi":
        df["a2"] = df["a"]
    if index_kind != "range":
        df = df.set_index(keys)

    args = get_groupby_method_args(reduction_func, df)
    gb = df.groupby(keys, as_index=as_index, sort=sort, observed=observed)
    op_result = getattr(gb, reduction_func)(*args)

    expected = Index([1, 4, 3, 2])
    result = (
        op_result.index.get_level_values("a").categories
        if as_index
        else op_result["a"].cat.categories
    )
    tm.assert_index_equal(result, expected)

    if index_kind == "multi":
        second_level = op_result.index.get_level_values("a2")
        tm.assert_index_equal(second_level.categories, expected)


@pytest.mark.parametrize("index_kind", ["single", "multi"])
@pytest.mark.parametrize("ordered", [True, False])
def test_category_order_transformer(
    as_index, sort, observed, transformation_func, index_kind, ordered
):
    """Check that transformations keep the category order of the index."""
    # GH#48749
    cat_values = Categorical([2, 1, 2, 3], categories=[1, 4, 3, 2], ordered=ordered)
    df = DataFrame({"a": cat_values, "b": range(4)})

    # both index kinds group by index level(s); "multi" duplicates the key
    keys = ["a", "a2"] if index_kind == "multi" else ["a"]
    if index_kind == "multi":
        df["a2"] = df["a"]
    df = df.set_index(keys)

    args = get_groupby_method_args(transformation_func, df)
    gb = df.groupby(keys, as_index=as_index, sort=sort, observed=observed)
    op_result = getattr(gb, transformation_func)(*args)

    expected = Index([1, 4, 3, 2])
    first_level = op_result.index.get_level_values("a")
    tm.assert_index_equal(first_level.categories, expected)

    if index_kind == "multi":
        second_level = op_result.index.get_level_values("a2")
        tm.assert_index_equal(second_level.categories, expected)


@pytest.mark.parametrize("index_kind", ["range", "single", "multi"])
@pytest.mark.parametrize("method", ["head", "tail"])
@pytest.mark.parametrize("ordered", [True, False])
def test_category_order_head_tail(
    as_index, sort, observed, method, index_kind, ordered
):
    """Check that head/tail keep the grouper's category order."""
    # GH#48749
    cat_values = Categorical([2, 1, 2, 3], categories=[1, 4, 3, 2], ordered=ordered)
    df = DataFrame({"a": cat_values, "b": range(4)})

    # "range" groups by the column; the other kinds group by index level(s)
    keys = ["a", "a2"] if index_kind == "multi" else ["a"]
    if index_kind == "multi":
        df["a2"] = df["a"]
    if index_kind != "range":
        df = df.set_index(keys)

    gb = df.groupby(keys, as_index=as_index, sort=sort, observed=observed)
    op_result = getattr(gb, method)()

    expected = Index([1, 4, 3, 2])
    if index_kind == "range":
        # the key is still a column of the result
        result = op_result["a"].cat.categories
    else:
        result = op_result.index.get_level_values("a").categories
    tm.assert_index_equal(result, expected)

    if index_kind == "multi":
        second_level = op_result.index.get_level_values("a2")
        tm.assert_index_equal(second_level.categories, expected)


@pytest.mark.parametrize("index_kind", ["range", "single", "multi"])
@pytest.mark.parametrize("method", ["apply", "agg", "transform"])
@pytest.mark.parametrize("ordered", [True, False])
def test_category_order_apply(as_index, sort, observed, method, index_kind, ordered):
    """Check that apply/agg/transform keep the grouper's category order."""
    # GH#48749
    grouping_by_column = index_kind == "range"
    if grouping_by_column:
        nothing_to_test = method == "transform"
    else:
        nothing_to_test = not as_index
    if nothing_to_test:
        pytest.skip("No categories in result, nothing to test")

    cat_values = Categorical([2, 1, 2, 3], categories=[1, 4, 3, 2], ordered=ordered)
    df = DataFrame({"a": cat_values, "b": range(4)})

    # "range" groups by the column; the other kinds group by index level(s)
    keys = ["a", "a2"] if index_kind == "multi" else ["a"]
    if index_kind == "multi":
        df["a2"] = df["a"]
    if not grouping_by_column:
        df = df.set_index(keys)

    gb = df.groupby(keys, as_index=as_index, sort=sort, observed=observed)
    op_result = getattr(gb, method)(lambda x: x.sum())

    expected = Index([1, 4, 3, 2])
    if grouping_by_column and (method == "transform" or not as_index):
        result = op_result["a"].cat.categories
    else:
        result = op_result.index.get_level_values("a").categories
    tm.assert_index_equal(result, expected)

    if index_kind == "multi":
        second_level = op_result.index.get_level_values("a2")
        tm.assert_index_equal(second_level.categories, expected)
2 changes: 1 addition & 1 deletion pandas/tests/groupby/test_groupby_dropna.py
Original file line number Diff line number Diff line change
Expand Up @@ -465,7 +465,7 @@ def test_no_sort_keep_na(request, sequence_index, dtype, test_series):
if dtype == "category":
index = pd.CategoricalIndex(
[uniques[e] for e in summed],
list({uniques[k]: 0 for k in sequence if not pd.isnull(uniques[k])}),
df["key"].cat.categories,
name="key",
)
elif isinstance(dtype, str) and dtype.startswith("Sparse"):
Expand Down