Skip to content

Commit 9c05918

Browse files
authored
BUG: groupby with sort=False still sorts an ordered categorical (#49613)
* BUG: groupby with sort=False still sorts an ordered categorical
* Add versionchanged
1 parent f82b1c6 commit 9c05918

File tree

6 files changed

+55
-118
lines changed

6 files changed

+55
-118
lines changed

doc/source/whatsnew/v2.0.0.rst

+2
Original file line numberDiff line numberDiff line change
@@ -651,6 +651,8 @@ Groupby/resample/rolling
651651
- Bug in :meth:`DataFrameGroupBy.resample` produces inconsistent results when passing empty DataFrame (:issue:`47705`)
652652
- Bug in :class:`.DataFrameGroupBy` and :class:`.SeriesGroupBy` would not include unobserved categories in result when grouping by categorical indexes (:issue:`49354`)
653653
- Bug in :class:`.DataFrameGroupBy` and :class:`.SeriesGroupBy` would change result order depending on the input index when grouping by categoricals (:issue:`49223`)
654+
- Bug in :class:`.DataFrameGroupBy` and :class:`.SeriesGroupBy` where grouping on ordered categorical data would sort the result values even when ``sort=False`` was specified (:issue:`42482`)
655+
-
654656

655657
Reshaping
656658
^^^^^^^^^

pandas/core/groupby/categorical.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,7 @@ def recode_for_groupby(
5353
unique_codes = unique1d(c.codes)
5454

5555
take_codes = unique_codes[unique_codes != -1]
56-
if c.ordered or sort:
56+
if sort:
5757
take_codes = np.sort(take_codes)
5858

5959
# we recode according to the uniques
@@ -75,7 +75,7 @@ def recode_for_groupby(
7575
all_codes = np.arange(c.categories.nunique())
7676
# GH 38140: exclude nan from indexer for categories
7777
unique_notnan_codes = unique1d(c.codes[c.codes != -1])
78-
if c.ordered:
78+
if sort:
7979
unique_notnan_codes = np.sort(unique_notnan_codes)
8080
if len(all_codes) > len(unique_notnan_codes):
8181
# GH 13179: All categories need to be present, even if missing from the data

pandas/core/groupby/groupby.py

+3-1
Original file line numberDiff line numberDiff line change
@@ -4116,7 +4116,9 @@ def _reindex_output(
41164116
# "ndarray[Any, dtype[floating[_64Bit]]]"; expected "Index"
41174117
levels_list.append(qs) # type: ignore[arg-type]
41184118
names = names + [None]
4119-
index, _ = MultiIndex.from_product(levels_list, names=names).sortlevel()
4119+
index = MultiIndex.from_product(levels_list, names=names)
4120+
if self.sort:
4121+
index = index.sortlevel()[0]
41204122

41214123
if self.as_index:
41224124
# Always holds for SeriesGroupBy unless GH#36507 is implemented

pandas/core/groupby/grouper.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -655,7 +655,7 @@ def _codes_and_uniques(self) -> tuple[npt.NDArray[np.signedinteger], ArrayLike]:
655655
if self._observed:
656656
ucodes = algorithms.unique1d(cat.codes)
657657
ucodes = ucodes[ucodes != -1]
658-
if self._sort or cat.ordered:
658+
if self._sort:
659659
ucodes = np.sort(ucodes)
660660
else:
661661
ucodes = np.arange(len(categories))

pandas/core/shared_docs.py

+6
Original file line numberDiff line numberDiff line change
@@ -119,6 +119,12 @@
119119
Sort group keys. Get better performance by turning this off.
120120
Note this does not influence the order of observations within each
121121
group. Groupby preserves the order of rows within each group.
122+
123+
.. versionchanged:: 2.0.0
124+
125+
Specifying ``sort=False`` with an ordered categorical grouper will no
126+
longer sort the values.
127+
122128
group_keys : bool, optional
123129
When calling apply and the ``by`` argument produces a like-indexed
124130
(i.e. :ref:`a transform <groupby.transform>`) result, add group keys to

pandas/tests/groupby/test_categorical.py

+41-114
Original file line numberDiff line numberDiff line change
@@ -818,12 +818,14 @@ def test_preserve_categories():
818818

819819
# ordered=True
820820
df = DataFrame({"A": Categorical(list("ba"), categories=categories, ordered=True)})
821-
index = CategoricalIndex(categories, categories, ordered=True, name="A")
821+
sort_index = CategoricalIndex(categories, categories, ordered=True, name="A")
822+
nosort_index = CategoricalIndex(list("bac"), categories, ordered=True, name="A")
822823
tm.assert_index_equal(
823-
df.groupby("A", sort=True, observed=False).first().index, index
824+
df.groupby("A", sort=True, observed=False).first().index, sort_index
824825
)
826+
# GH#42482 - don't sort result when sort=False, even when ordered=True
825827
tm.assert_index_equal(
826-
df.groupby("A", sort=False, observed=False).first().index, index
828+
df.groupby("A", sort=False, observed=False).first().index, nosort_index
827829
)
828830

829831
# ordered=False
@@ -972,8 +974,11 @@ def test_sort():
972974
tm.assert_series_equal(res, exp)
973975

974976

975-
def test_sort2():
977+
@pytest.mark.parametrize("ordered", [True, False])
978+
def test_sort2(sort, ordered):
976979
# dataframe groupby sort was being ignored # GH 8868
980+
# GH#48749 - don't change order of categories
981+
# GH#42482 - don't sort result when sort=False, even when ordered=True
977982
df = DataFrame(
978983
[
979984
["(7.5, 10]", 10, 10],
@@ -986,53 +991,28 @@ def test_sort2():
986991
],
987992
columns=["range", "foo", "bar"],
988993
)
989-
df["range"] = Categorical(df["range"], ordered=True)
990-
index = CategoricalIndex(
991-
["(0, 2.5]", "(2.5, 5]", "(5, 7.5]", "(7.5, 10]"], name="range", ordered=True
992-
)
993-
expected_sort = DataFrame(
994-
[[1, 60], [5, 30], [6, 40], [10, 10]], columns=["foo", "bar"], index=index
995-
)
996-
997-
col = "range"
998-
result_sort = df.groupby(col, sort=True, observed=False).first()
999-
tm.assert_frame_equal(result_sort, expected_sort)
1000-
1001-
# when categories is ordered, group is ordered by category's order
1002-
expected_sort = result_sort
1003-
result_sort = df.groupby(col, sort=False, observed=False).first()
1004-
tm.assert_frame_equal(result_sort, expected_sort)
994+
df["range"] = Categorical(df["range"], ordered=ordered)
995+
result = df.groupby("range", sort=sort, observed=False).first()
1005996

1006-
df["range"] = Categorical(df["range"], ordered=False)
1007-
index = CategoricalIndex(
1008-
["(0, 2.5]", "(2.5, 5]", "(5, 7.5]", "(7.5, 10]"], name="range"
1009-
)
1010-
expected_sort = DataFrame(
1011-
[[1, 60], [5, 30], [6, 40], [10, 10]], columns=["foo", "bar"], index=index
1012-
)
1013-
1014-
index = CategoricalIndex(
1015-
["(7.5, 10]", "(2.5, 5]", "(5, 7.5]", "(0, 2.5]"],
1016-
# GH#48749 - don't change order of categories
1017-
categories=["(0, 2.5]", "(2.5, 5]", "(5, 7.5]", "(7.5, 10]"],
1018-
name="range",
1019-
)
1020-
expected_nosort = DataFrame(
1021-
[[10, 10], [5, 30], [6, 40], [1, 60]], index=index, columns=["foo", "bar"]
997+
if sort:
998+
data_values = [[1, 60], [5, 30], [6, 40], [10, 10]]
999+
index_values = ["(0, 2.5]", "(2.5, 5]", "(5, 7.5]", "(7.5, 10]"]
1000+
else:
1001+
data_values = [[10, 10], [5, 30], [6, 40], [1, 60]]
1002+
index_values = ["(7.5, 10]", "(2.5, 5]", "(5, 7.5]", "(0, 2.5]"]
1003+
expected = DataFrame(
1004+
data_values,
1005+
columns=["foo", "bar"],
1006+
index=CategoricalIndex(index_values, name="range", ordered=ordered),
10221007
)
10231008

1024-
col = "range"
1025-
1026-
# this is an unordered categorical, but we allow this ####
1027-
result_sort = df.groupby(col, sort=True, observed=False).first()
1028-
tm.assert_frame_equal(result_sort, expected_sort)
1029-
1030-
result_nosort = df.groupby(col, sort=False, observed=False).first()
1031-
tm.assert_frame_equal(result_nosort, expected_nosort)
1009+
tm.assert_frame_equal(result, expected)
10321010

10331011

1034-
def test_sort_datetimelike():
1012+
@pytest.mark.parametrize("ordered", [True, False])
1013+
def test_sort_datetimelike(sort, ordered):
10351014
# GH10505
1015+
# GH#42482 - don't sort result when sort=False, even when ordered=True
10361016

10371017
# use same data as test_groupby_sort_categorical, which category is
10381018
# corresponding to datetime.month
@@ -1054,80 +1034,30 @@ def test_sort_datetimelike():
10541034
)
10551035

10561036
# ordered=True
1057-
df["dt"] = Categorical(df["dt"], ordered=True)
1058-
index = [
1059-
datetime(2011, 1, 1),
1060-
datetime(2011, 2, 1),
1061-
datetime(2011, 5, 1),
1062-
datetime(2011, 7, 1),
1063-
]
1064-
result_sort = DataFrame(
1065-
[[1, 60], [5, 30], [6, 40], [10, 10]], columns=["foo", "bar"]
1066-
)
1067-
result_sort.index = CategoricalIndex(index, name="dt", ordered=True)
1068-
1069-
index = [
1070-
datetime(2011, 7, 1),
1071-
datetime(2011, 2, 1),
1072-
datetime(2011, 5, 1),
1073-
datetime(2011, 1, 1),
1074-
]
1075-
result_nosort = DataFrame(
1076-
[[10, 10], [5, 30], [6, 40], [1, 60]], columns=["foo", "bar"]
1077-
)
1078-
result_nosort.index = CategoricalIndex(
1079-
index, categories=index, name="dt", ordered=True
1080-
)
1081-
1082-
col = "dt"
1083-
tm.assert_frame_equal(
1084-
result_sort, df.groupby(col, sort=True, observed=False).first()
1085-
)
1086-
1087-
# when categories is ordered, group is ordered by category's order
1088-
tm.assert_frame_equal(
1089-
result_sort, df.groupby(col, sort=False, observed=False).first()
1090-
)
1091-
1092-
# ordered = False
1093-
df["dt"] = Categorical(df["dt"], ordered=False)
1094-
sort_index = CategoricalIndex(
1095-
[
1037+
df["dt"] = Categorical(df["dt"], ordered=ordered)
1038+
if sort:
1039+
data_values = [[1, 60], [5, 30], [6, 40], [10, 10]]
1040+
index_values = [
10961041
datetime(2011, 1, 1),
10971042
datetime(2011, 2, 1),
10981043
datetime(2011, 5, 1),
10991044
datetime(2011, 7, 1),
1100-
],
1101-
name="dt",
1102-
)
1103-
result_sort = DataFrame(
1104-
[[1, 60], [5, 30], [6, 40], [10, 10]], columns=["foo", "bar"], index=sort_index
1105-
)
1106-
1107-
nosort_index = CategoricalIndex(
1108-
[
1045+
]
1046+
else:
1047+
data_values = [[10, 10], [5, 30], [6, 40], [1, 60]]
1048+
index_values = [
11091049
datetime(2011, 7, 1),
11101050
datetime(2011, 2, 1),
11111051
datetime(2011, 5, 1),
11121052
datetime(2011, 1, 1),
1113-
],
1114-
# GH#48749 - don't change order of categories
1115-
categories=sort_index.categories,
1116-
name="dt",
1117-
)
1118-
result_nosort = DataFrame(
1119-
[[10, 10], [5, 30], [6, 40], [1, 60]],
1053+
]
1054+
expected = DataFrame(
1055+
data_values,
11201056
columns=["foo", "bar"],
1121-
index=nosort_index,
1122-
)
1123-
1124-
col = "dt"
1125-
tm.assert_frame_equal(
1126-
result_sort, df.groupby(col, sort=True, observed=False).first()
1127-
)
1128-
tm.assert_frame_equal(
1129-
result_nosort, df.groupby(col, sort=False, observed=False).first()
1057+
index=CategoricalIndex(index_values, name="dt", ordered=ordered),
11301058
)
1059+
result = df.groupby("dt", sort=sort, observed=False).first()
1060+
tm.assert_frame_equal(result, expected)
11311061

11321062

11331063
def test_empty_sum():
@@ -2055,13 +1985,10 @@ def test_category_order_apply(as_index, sort, observed, method, index_kind, orde
20551985

20561986

20571987
@pytest.mark.parametrize("index_kind", ["range", "single", "multi"])
2058-
def test_many_categories(request, as_index, sort, index_kind, ordered):
1988+
def test_many_categories(as_index, sort, index_kind, ordered):
20591989
# GH#48749 - Test when the grouper has many categories
20601990
if index_kind != "range" and not as_index:
20611991
pytest.skip(reason="Result doesn't have categories, nothing to test")
2062-
if index_kind == "multi" and as_index and not sort and ordered:
2063-
msg = "GH#48749 - values are unsorted even though the Categorical is ordered"
2064-
request.node.add_marker(pytest.mark.xfail(reason=msg))
20651992
categories = np.arange(9999, -1, -1)
20661993
grouper = Categorical([2, 1, 2, 3], categories=categories, ordered=ordered)
20671994
df = DataFrame({"a": grouper, "b": range(4)})
@@ -2078,7 +2005,7 @@ def test_many_categories(request, as_index, sort, index_kind, ordered):
20782005
result = gb.sum()
20792006

20802007
# Test is setup so that data and index are the same values
2081-
data = [3, 2, 1] if sort or ordered else [2, 1, 3]
2008+
data = [3, 2, 1] if sort else [2, 1, 3]
20822009

20832010
index = CategoricalIndex(
20842011
data, categories=grouper.categories, ordered=ordered, name="a"

0 commit comments

Comments
 (0)