Skip to content

Commit dbf8aaf

Browse files
authored
BUG: GroupBy.value_counts sorting order (#56016)
* BUG: GroupBy.value_counts sorting order * Whatsnew * cleanup * Fix categorical and add test * cleanup
1 parent c95e943 commit dbf8aaf

File tree

3 files changed

+93
-19
lines changed

3 files changed

+93
-19
lines changed

doc/source/whatsnew/v2.2.0.rst

+3
Original file line numberDiff line numberDiff line change
@@ -446,6 +446,9 @@ Groupby/resample/rolling
446446
- Bug in :meth:`DataFrame.resample` not respecting ``closed`` and ``label`` arguments for :class:`~pandas.tseries.offsets.BusinessDay` (:issue:`55282`)
447447
- Bug in :meth:`DataFrame.resample` where bin edges were not correct for :class:`~pandas.tseries.offsets.BusinessDay` (:issue:`55281`)
448448
- Bug in :meth:`DataFrame.resample` where bin edges were not correct for :class:`~pandas.tseries.offsets.MonthBegin` (:issue:`55271`)
449+
- Bug in :meth:`DataFrameGroupBy.value_counts` and :meth:`SeriesGroupBy.value_counts` could result in incorrect sorting if the columns of the DataFrame or name of the Series are integers (:issue:`56007`)
450+
- Bug in :meth:`DataFrameGroupBy.value_counts` and :meth:`SeriesGroupBy.value_counts` would not respect ``sort=False`` in :meth:`DataFrame.groupby` and :meth:`Series.groupby` (:issue:`56007`)
451+
- Bug in :meth:`DataFrameGroupBy.value_counts` and :meth:`SeriesGroupBy.value_counts` would sort by proportions rather than frequencies when ``sort=True`` and ``normalize=True`` (:issue:`56007`)
449452

450453
Reshaping
451454
^^^^^^^^^

pandas/core/groupby/groupby.py

+18-9
Original file line numberDiff line numberDiff line change
@@ -2821,11 +2821,27 @@ def _value_counts(
28212821
for grouping in groupings
28222822
):
28232823
levels_list = [ping.result_index for ping in groupings]
2824-
multi_index, _ = MultiIndex.from_product(
2824+
multi_index = MultiIndex.from_product(
28252825
levels_list, names=[ping.name for ping in groupings]
2826-
).sortlevel()
2826+
)
28272827
result_series = result_series.reindex(multi_index, fill_value=0)
28282828

2829+
if sort:
2830+
# Sort by the values
2831+
result_series = result_series.sort_values(
2832+
ascending=ascending, kind="stable"
2833+
)
2834+
if self.sort:
2835+
# Sort by the groupings
2836+
names = result_series.index.names
2837+
# GH#56007 - Temporarily replace names in case they are integers
2838+
result_series.index.names = range(len(names))
2839+
index_level = list(range(len(self.grouper.groupings)))
2840+
result_series = result_series.sort_index(
2841+
level=index_level, sort_remaining=False
2842+
)
2843+
result_series.index.names = names
2844+
28292845
if normalize:
28302846
# Normalize the results by dividing by the original group sizes.
28312847
# We are guaranteed to have the first N levels be the
@@ -2845,13 +2861,6 @@ def _value_counts(
28452861
# Handle groups of non-observed categories
28462862
result_series = result_series.fillna(0.0)
28472863

2848-
if sort:
2849-
# Sort the values and then resort by the main grouping
2850-
index_level = range(len(self.grouper.groupings))
2851-
result_series = result_series.sort_values(ascending=ascending).sort_index(
2852-
level=index_level, sort_remaining=False
2853-
)
2854-
28552864
result: Series | DataFrame
28562865
if self.as_index:
28572866
result = result_series

pandas/tests/groupby/methods/test_value_counts.py

+72-10
Original file line numberDiff line numberDiff line change
@@ -385,8 +385,8 @@ def test_against_frame_and_seriesgroupby(
385385
"sort, ascending, expected_rows, expected_count, expected_group_size",
386386
[
387387
(False, None, [0, 1, 2, 3, 4], [1, 1, 1, 2, 1], [1, 3, 1, 3, 1]),
388-
(True, False, [4, 3, 1, 2, 0], [1, 2, 1, 1, 1], [1, 3, 3, 1, 1]),
389-
(True, True, [4, 1, 3, 2, 0], [1, 1, 2, 1, 1], [1, 3, 3, 1, 1]),
388+
(True, False, [3, 0, 1, 2, 4], [2, 1, 1, 1, 1], [3, 1, 3, 1, 1]),
389+
(True, True, [0, 1, 2, 4, 3], [1, 1, 1, 1, 2], [1, 3, 1, 1, 3]),
390390
],
391391
)
392392
def test_compound(
@@ -811,19 +811,19 @@ def test_categorical_single_grouper_observed_false(
811811
("FR", "female", "high"),
812812
("FR", "male", "medium"),
813813
("FR", "female", "low"),
814-
("FR", "male", "high"),
815814
("FR", "female", "medium"),
815+
("FR", "male", "high"),
816816
("US", "female", "high"),
817817
("US", "male", "low"),
818-
("US", "male", "medium"),
819-
("US", "male", "high"),
820-
("US", "female", "medium"),
821818
("US", "female", "low"),
822-
("ASIA", "male", "low"),
823-
("ASIA", "male", "high"),
824-
("ASIA", "female", "medium"),
825-
("ASIA", "female", "low"),
819+
("US", "female", "medium"),
820+
("US", "male", "high"),
821+
("US", "male", "medium"),
826822
("ASIA", "female", "high"),
823+
("ASIA", "female", "low"),
824+
("ASIA", "female", "medium"),
825+
("ASIA", "male", "high"),
826+
("ASIA", "male", "low"),
827827
("ASIA", "male", "medium"),
828828
]
829829

@@ -1177,3 +1177,65 @@ def test_value_counts_integer_columns():
11771177
{1: ["a", "a", "a"], 2: ["a", "a", "d"], 3: ["a", "b", "c"], "count": 1}
11781178
)
11791179
tm.assert_frame_equal(result, expected)
1180+
1181+
1182+
@pytest.mark.parametrize("vc_sort", [True, False])
1183+
@pytest.mark.parametrize("normalize", [True, False])
1184+
def test_value_counts_sort(sort, vc_sort, normalize):
1185+
# GH#55951
1186+
df = DataFrame({"a": [2, 1, 1, 1], 0: [3, 4, 3, 3]})
1187+
gb = df.groupby("a", sort=sort)
1188+
result = gb.value_counts(sort=vc_sort, normalize=normalize)
1189+
1190+
if normalize:
1191+
values = [2 / 3, 1 / 3, 1.0]
1192+
else:
1193+
values = [2, 1, 1]
1194+
index = MultiIndex(
1195+
levels=[[1, 2], [3, 4]], codes=[[0, 0, 1], [0, 1, 0]], names=["a", 0]
1196+
)
1197+
expected = Series(values, index=index, name="proportion" if normalize else "count")
1198+
if sort and vc_sort:
1199+
taker = [0, 1, 2]
1200+
elif sort and not vc_sort:
1201+
taker = [0, 1, 2]
1202+
elif not sort and vc_sort:
1203+
taker = [0, 2, 1]
1204+
else:
1205+
taker = [2, 1, 0]
1206+
expected = expected.take(taker)
1207+
1208+
tm.assert_series_equal(result, expected)
1209+
1210+
1211+
@pytest.mark.parametrize("vc_sort", [True, False])
1212+
@pytest.mark.parametrize("normalize", [True, False])
1213+
def test_value_counts_sort_categorical(sort, vc_sort, normalize):
1214+
# GH#55951
1215+
df = DataFrame({"a": [2, 1, 1, 1], 0: [3, 4, 3, 3]}, dtype="category")
1216+
gb = df.groupby("a", sort=sort, observed=True)
1217+
result = gb.value_counts(sort=vc_sort, normalize=normalize)
1218+
1219+
if normalize:
1220+
values = [2 / 3, 1 / 3, 1.0, 0.0]
1221+
else:
1222+
values = [2, 1, 1, 0]
1223+
name = "proportion" if normalize else "count"
1224+
expected = DataFrame(
1225+
{
1226+
"a": Categorical([1, 1, 2, 2]),
1227+
0: Categorical([3, 4, 3, 4]),
1228+
name: values,
1229+
}
1230+
).set_index(["a", 0])[name]
1231+
if sort and vc_sort:
1232+
taker = [0, 1, 2, 3]
1233+
elif sort and not vc_sort:
1234+
taker = [0, 1, 2, 3]
1235+
elif not sort and vc_sort:
1236+
taker = [0, 2, 1, 3]
1237+
else:
1238+
taker = [2, 3, 0, 1]
1239+
expected = expected.take(taker)
1240+
1241+
tm.assert_series_equal(result, expected)

0 commit comments

Comments
 (0)