diff --git a/doc/source/whatsnew/v0.19.2.txt b/doc/source/whatsnew/v0.19.2.txt index cafbdb731f494..acc01b275c97f 100644 --- a/doc/source/whatsnew/v0.19.2.txt +++ b/doc/source/whatsnew/v0.19.2.txt @@ -36,7 +36,7 @@ Bug Fixes - Bug in ``pd.read_csv`` for the Python engine in which an unhelpful error message was being raised when ``skipfooter`` was not being respected by Python's CSV library (:issue:`13879`) - +- Bug in ``.groupby(..., sort=True)`` of a non-lexsorted MultiIndex when grouping with multiple levels (:issue:`14776`) diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index ea26f5c0d29b8..f449e16686190 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -861,7 +861,17 @@ def reset_identity(values): if isinstance(result, Series): result = result.reindex(ax) else: - result = result.reindex_axis(ax, axis=self.axis) + + # this is a very unfortunate situation + # we have a multi-index that is NOT lexsorted + # and we have a result which is duplicated + # we can't reindex, so we resort to this + # GH 14776 + if isinstance(ax, MultiIndex) and not ax.is_unique: + result = result.take(result.index.get_indexer_for( + ax.values).unique(), axis=self.axis) + else: + result = result.reindex_axis(ax, axis=self.axis) elif self.group_keys: diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py index 52d1c5c3681e0..37499e09d6dc6 100644 --- a/pandas/tests/test_groupby.py +++ b/pandas/tests/test_groupby.py @@ -4736,6 +4736,25 @@ def test_groupby_multiindex_not_lexsorted(self): result = not_lexsorted_df.groupby('a').mean() tm.assert_frame_equal(expected, result) + # a transforming function should work regardless of sort + # GH 14776 + df = DataFrame({'x': ['a', 'a', 'b', 'a'], + 'y': [1, 1, 2, 2], + 'z': [1, 2, 3, 4]}).set_index(['x', 'y']) + self.assertFalse(df.index.is_lexsorted()) + + for level in [0, 1, [0, 1]]: + for sort in [False, True]: + result = df.groupby(level=level, sort=sort).apply( + DataFrame.drop_duplicates) + expected = df + tm.assert_frame_equal(expected, result) + + result = df.sort_index().groupby(level=level, sort=sort).apply( + DataFrame.drop_duplicates) + expected = df.sort_index() + tm.assert_frame_equal(expected, result) + def test_groupby_levels_and_columns(self): # GH9344, GH9049 idx_names = ['x', 'y']