Skip to content

Commit fd336fb

Browse files
Licht-T authored and jreback committed
BUG: Fix unexpected sort in groupby (#17621)
1 parent baadad7 commit fd336fb

File tree

6 files changed

+76
-30
lines changed

6 files changed

+76
-30
lines changed

doc/source/whatsnew/v0.21.0.txt

+1
Original file line numberDiff line numberDiff line change
@@ -627,6 +627,7 @@ Groupby/Resample/Rolling
627627
- Bug in ``.rolling(...).apply(...)`` with a ``DataFrame`` with a ``DatetimeIndex``, a ``window`` of a timedelta-convertible and ``min_periods >= 1`` (:issue:`15305`)
628628
- Bug in ``DataFrame.groupby`` where index and column keys were not recognized correctly when the number of keys equaled the number of elements on the groupby axis (:issue:`16859`)
629629
- Bug in ``groupby.nunique()`` with ``TimeGrouper`` which cannot handle ``NaT`` correctly (:issue:`17575`)
630+
- Bug in ``DataFrame.groupby`` where a single level selection from a ``MultiIndex`` unexpectedly sorts (:issue:`17537`)
630631

631632
Sparse
632633
^^^^^^

pandas/core/generic.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -6631,7 +6631,7 @@ def pct_change(self, periods=1, fill_method='pad', limit=None, freq=None,
66316631
return rs
66326632

66336633
def _agg_by_level(self, name, axis=0, level=0, skipna=True, **kwargs):
6634-
grouped = self.groupby(level=level, axis=axis)
6634+
grouped = self.groupby(level=level, axis=axis, sort=False)
66356635
if hasattr(grouped, name) and skipna:
66366636
return getattr(grouped, name)(**kwargs)
66376637
axis = self._get_axis_number(axis)

pandas/core/groupby.py

+21-2
Original file line numberDiff line numberDiff line change
@@ -2586,10 +2586,27 @@ def _get_grouper(obj, key=None, axis=0, level=None, sort=True,
25862586
"""
25872587
group_axis = obj._get_axis(axis)
25882588

2589-
# validate that the passed level is compatible with the passed
2589+
# validate that the passed single level is compatible with the passed
25902590
# axis of the object
25912591
if level is not None:
2592-
if not isinstance(group_axis, MultiIndex):
2592+
# TODO: This if-block and the else-block are almost the same.
2593+
# MultiIndex instance check is removable, but it seems that there are
2594+
# some processes only for non-MultiIndex in else-block,
2595+
# eg. `obj.index.name != level`. We have to consider carefully whether
2596+
# these are applicable for MultiIndex. Even if these are applicable,
2597+
# we need to check if it makes no side effect to subsequent processes
2598+
# on the outside of this condition.
2599+
# (GH 17621)
2600+
if isinstance(group_axis, MultiIndex):
2601+
if is_list_like(level) and len(level) == 1:
2602+
level = level[0]
2603+
2604+
if key is None and is_scalar(level):
2605+
# Get the level values from group_axis
2606+
key = group_axis.get_level_values(level)
2607+
level = None
2608+
2609+
else:
25932610
# allow level to be a length-one list-like object
25942611
# (e.g., level=[0])
25952612
# GH 13901
@@ -2611,6 +2628,8 @@ def _get_grouper(obj, key=None, axis=0, level=None, sort=True,
26112628
raise ValueError('level > 0 or level < -1 only valid with '
26122629
' MultiIndex')
26132630

2631+
# NOTE: `group_axis` and `group_axis.get_level_values(level)`
2632+
# are the same in this section.
26142633
level = None
26152634
key = group_axis
26162635

pandas/tests/groupby/test_groupby.py

+28-19
Original file line numberDiff line numberDiff line change
@@ -1791,18 +1791,20 @@ def aggfun(ser):
17911791
agged2 = df.groupby(keys).aggregate(aggfun)
17921792
assert len(agged2.columns) + 1 == len(df.columns)
17931793

1794-
def test_groupby_level(self):
1794+
@pytest.mark.parametrize('sort', [True, False])
1795+
def test_groupby_level(self, sort):
1796+
# GH 17537
17951797
frame = self.mframe
17961798
deleveled = frame.reset_index()
17971799

1798-
result0 = frame.groupby(level=0).sum()
1799-
result1 = frame.groupby(level=1).sum()
1800+
result0 = frame.groupby(level=0, sort=sort).sum()
1801+
result1 = frame.groupby(level=1, sort=sort).sum()
18001802

1801-
expected0 = frame.groupby(deleveled['first'].values).sum()
1802-
expected1 = frame.groupby(deleveled['second'].values).sum()
1803+
expected0 = frame.groupby(deleveled['first'].values, sort=sort).sum()
1804+
expected1 = frame.groupby(deleveled['second'].values, sort=sort).sum()
18031805

1804-
expected0 = expected0.reindex(frame.index.levels[0])
1805-
expected1 = expected1.reindex(frame.index.levels[1])
1806+
expected0.index.name = 'first'
1807+
expected1.index.name = 'second'
18061808

18071809
assert result0.index.name == 'first'
18081810
assert result1.index.name == 'second'
@@ -1813,15 +1815,15 @@ def test_groupby_level(self):
18131815
assert result1.index.name == frame.index.names[1]
18141816

18151817
# groupby level name
1816-
result0 = frame.groupby(level='first').sum()
1817-
result1 = frame.groupby(level='second').sum()
1818+
result0 = frame.groupby(level='first', sort=sort).sum()
1819+
result1 = frame.groupby(level='second', sort=sort).sum()
18181820
assert_frame_equal(result0, expected0)
18191821
assert_frame_equal(result1, expected1)
18201822

18211823
# axis=1
18221824

1823-
result0 = frame.T.groupby(level=0, axis=1).sum()
1824-
result1 = frame.T.groupby(level=1, axis=1).sum()
1825+
result0 = frame.T.groupby(level=0, axis=1, sort=sort).sum()
1826+
result1 = frame.T.groupby(level=1, axis=1, sort=sort).sum()
18251827
assert_frame_equal(result0, expected0.T)
18261828
assert_frame_equal(result1, expected1.T)
18271829

@@ -1835,15 +1837,17 @@ def test_groupby_level_index_names(self):
18351837
df.groupby(level='exp')
18361838
pytest.raises(ValueError, df.groupby, level='foo')
18371839

1838-
def test_groupby_level_with_nas(self):
1840+
@pytest.mark.parametrize('sort', [True, False])
1841+
def test_groupby_level_with_nas(self, sort):
1842+
# GH 17537
18391843
index = MultiIndex(levels=[[1, 0], [0, 1, 2, 3]],
18401844
labels=[[1, 1, 1, 1, 0, 0, 0, 0], [0, 1, 2, 3, 0, 1,
18411845
2, 3]])
18421846

18431847
# factorizing doesn't confuse things
18441848
s = Series(np.arange(8.), index=index)
1845-
result = s.groupby(level=0).sum()
1846-
expected = Series([22., 6.], index=[1, 0])
1849+
result = s.groupby(level=0, sort=sort).sum()
1850+
expected = Series([6., 22.], index=[0, 1])
18471851
assert_series_equal(result, expected)
18481852

18491853
index = MultiIndex(levels=[[1, 0], [0, 1, 2, 3]],
@@ -1852,8 +1856,8 @@ def test_groupby_level_with_nas(self):
18521856

18531857
# factorizing doesn't confuse things
18541858
s = Series(np.arange(8.), index=index)
1855-
result = s.groupby(level=0).sum()
1856-
expected = Series([18., 6.], index=[1, 0])
1859+
result = s.groupby(level=0, sort=sort).sum()
1860+
expected = Series([6., 18.], index=[0.0, 1.0])
18571861
assert_series_equal(result, expected)
18581862

18591863
def test_groupby_level_apply(self):
@@ -1936,9 +1940,14 @@ def test_groupby_complex(self):
19361940
result = a.sum(level=0)
19371941
assert_series_equal(result, expected)
19381942

1939-
def test_level_preserve_order(self):
1940-
grouped = self.mframe.groupby(level=0)
1941-
exp_labels = np.array([0, 0, 0, 1, 1, 2, 2, 3, 3, 3], np.intp)
1943+
@pytest.mark.parametrize('sort,labels', [
1944+
[True, [2, 2, 2, 0, 0, 1, 1, 3, 3, 3]],
1945+
[False, [0, 0, 0, 1, 1, 2, 2, 3, 3, 3]]
1946+
])
1947+
def test_level_preserve_order(self, sort, labels):
1948+
# GH 17537
1949+
grouped = self.mframe.groupby(level=0, sort=sort)
1950+
exp_labels = np.array(labels, np.intp)
19421951
assert_almost_equal(grouped.grouper.labels[0], exp_labels)
19431952

19441953
def test_grouping_labels(self):

pandas/tests/groupby/test_whitelist.py

+12-4
Original file line numberDiff line numberDiff line change
@@ -174,12 +174,16 @@ def raw_frame():
174174

175175

176176
@pytest.mark.parametrize(
177-
"op, level, axis, skipna",
177+
"op, level, axis, skipna, sort",
178178
product(AGG_FUNCTIONS,
179179
lrange(2), lrange(2),
180+
[True, False],
180181
[True, False]))
181-
def test_regression_whitelist_methods(raw_frame, op, level, axis, skipna):
182+
def test_regression_whitelist_methods(
183+
raw_frame, op, level,
184+
axis, skipna, sort):
182185
# GH6944
186+
# GH 17537
183187
# explicitly test the whitelist methods
184188

185189
if axis == 0:
@@ -188,15 +192,19 @@ def test_regression_whitelist_methods(raw_frame, op, level, axis, skipna):
188192
frame = raw_frame.T
189193

190194
if op in AGG_FUNCTIONS_WITH_SKIPNA:
191-
grouped = frame.groupby(level=level, axis=axis)
195+
grouped = frame.groupby(level=level, axis=axis, sort=sort)
192196
result = getattr(grouped, op)(skipna=skipna)
193197
expected = getattr(frame, op)(level=level, axis=axis,
194198
skipna=skipna)
199+
if sort:
200+
expected = expected.sort_index(axis=axis, level=level)
195201
tm.assert_frame_equal(result, expected)
196202
else:
197-
grouped = frame.groupby(level=level, axis=axis)
203+
grouped = frame.groupby(level=level, axis=axis, sort=sort)
198204
result = getattr(grouped, op)()
199205
expected = getattr(frame, op)(level=level, axis=axis)
206+
if sort:
207+
expected = expected.sort_index(axis=axis, level=level)
200208
tm.assert_frame_equal(result, expected)
201209

202210

pandas/tests/test_multilevel.py

+13-4
Original file line numberDiff line numberDiff line change
@@ -1392,17 +1392,23 @@ def test_count(self):
13921392
AGG_FUNCTIONS = ['sum', 'prod', 'min', 'max', 'median', 'mean', 'skew',
13931393
'mad', 'std', 'var', 'sem']
13941394

1395-
def test_series_group_min_max(self):
1395+
@pytest.mark.parametrize('sort', [True, False])
1396+
def test_series_group_min_max(self, sort):
1397+
# GH 17537
13961398
for op, level, skipna in cart_product(self.AGG_FUNCTIONS, lrange(2),
13971399
[False, True]):
1398-
grouped = self.series.groupby(level=level)
1400+
grouped = self.series.groupby(level=level, sort=sort)
13991401
aggf = lambda x: getattr(x, op)(skipna=skipna)
14001402
# skipna=True
14011403
leftside = grouped.agg(aggf)
14021404
rightside = getattr(self.series, op)(level=level, skipna=skipna)
1405+
if sort:
1406+
rightside = rightside.sort_index(level=level)
14031407
tm.assert_series_equal(leftside, rightside)
14041408

1405-
def test_frame_group_ops(self):
1409+
@pytest.mark.parametrize('sort', [True, False])
1410+
def test_frame_group_ops(self, sort):
1411+
# GH 17537
14061412
self.frame.iloc[1, [1, 2]] = np.nan
14071413
self.frame.iloc[7, [0, 1]] = np.nan
14081414

@@ -1415,7 +1421,7 @@ def test_frame_group_ops(self):
14151421
else:
14161422
frame = self.frame.T
14171423

1418-
grouped = frame.groupby(level=level, axis=axis)
1424+
grouped = frame.groupby(level=level, axis=axis, sort=sort)
14191425

14201426
pieces = []
14211427

@@ -1426,6 +1432,9 @@ def aggf(x):
14261432
leftside = grouped.agg(aggf)
14271433
rightside = getattr(frame, op)(level=level, axis=axis,
14281434
skipna=skipna)
1435+
if sort:
1436+
rightside = rightside.sort_index(level=level, axis=axis)
1437+
frame = frame.sort_index(level=level, axis=axis)
14291438

14301439
# for good measure, groupby detail
14311440
level_index = frame._get_axis(axis).levels[level]

0 commit comments

Comments
 (0)