diff --git a/doc/source/release.rst b/doc/source/release.rst index 331a578c5c349..5e5fb929bd0ca 100644 --- a/doc/source/release.rst +++ b/doc/source/release.rst @@ -600,6 +600,8 @@ Bug Fixes - Fixed bug where inplace setting of levels or labels on ``MultiIndex`` would not clear cached ``values`` property and therefore return wrong ``values``. (:issue:`5215`) + - Fixed bug where filtering a grouped DataFrame or Series did not maintain + the original ordering (:issue:`4621`). pandas 0.12.0 ------------- diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index 22857449ead4f..e5447e5f8f58f 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -1641,7 +1641,7 @@ def true_and_notnull(x, *args, **kwargs): if len(indexers) == 0: filtered = self.obj.take([]) # because np.concatenate would fail else: - filtered = self.obj.take(np.concatenate(indexers)) + filtered = self.obj.take(np.sort(np.concatenate(indexers))) if dropna: return filtered else: @@ -2166,7 +2166,7 @@ def add_indexer(): if len(indexers) == 0: filtered = self.obj.take([]) # because np.concatenate would fail else: - filtered = self.obj.take(np.concatenate(indexers)) + filtered = self.obj.take(np.sort(np.concatenate(indexers))) if dropna: return filtered else: diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py index 8f1bc91f7b46e..29f64090ddb11 100644 --- a/pandas/tests/test_groupby.py +++ b/pandas/tests/test_groupby.py @@ -2622,14 +2622,12 @@ def test_filter_out_no_groups(self): grouper = s.apply(lambda x: x % 2) grouped = s.groupby(grouper) filtered = grouped.filter(lambda x: x.mean() > 0) - filtered.sort() # was sorted by group - s.sort() # was sorted arbitrarily assert_series_equal(filtered, s) df = pd.DataFrame({'A': [1, 12, 12, 1], 'B': 'a b c d'.split()}) grouper = df['A'].apply(lambda x: x % 2) grouped = df.groupby(grouper) filtered = grouped.filter(lambda x: x['A'].mean() > 0) - assert_frame_equal(filtered.sort(), df) + assert_frame_equal(filtered, df) def test_filter_condition_raises(self): import pandas as pd @@ -2706,7 +2704,7 @@ def test_filter_against_workaround(self): old_way = df[grouped.floats.\ transform(lambda x: x.mean() > N/20).astype('bool')] new_way = grouped.filter(lambda x: x['floats'].mean() > N/20) - assert_frame_equal(new_way.sort(), old_way.sort()) + assert_frame_equal(new_way, old_way) # Group by floats (rounded); filter on strings. grouper = df.floats.apply(lambda x: np.round(x, -1)) @@ -2715,14 +2713,14 @@ def test_filter_against_workaround(self): transform(lambda x: len(x) < N/10).astype('bool')] new_way = grouped.filter( lambda x: len(x.letters) < N/10) - assert_frame_equal(new_way.sort(), old_way.sort()) + assert_frame_equal(new_way, old_way) # Group by strings; filter on ints. grouped = df.groupby('letters') old_way = df[grouped.ints.\ transform(lambda x: x.mean() > N/20).astype('bool')] new_way = grouped.filter(lambda x: x['ints'].mean() > N/20) - assert_frame_equal(new_way.sort_index(), old_way.sort_index()) + assert_frame_equal(new_way, old_way) def test_filter_using_len(self): # BUG GH4447 @@ -2747,6 +2745,48 @@ def test_filter_using_len(self): expected = s[[]] assert_series_equal(actual, expected) + def test_filter_maintains_ordering(self): + # Simple case: index is sequential. #4621 + df = DataFrame({'pid' : [1,1,1,2,2,3,3,3], + 'tag' : [23,45,62,24,45,34,25,62]}) + s = df['pid'] + grouped = df.groupby('tag') + actual = grouped.filter(lambda x: len(x) > 1) + expected = df.iloc[[1, 2, 4, 7]] + assert_frame_equal(actual, expected) + + grouped = s.groupby(df['tag']) + actual = grouped.filter(lambda x: len(x) > 1) + expected = s.iloc[[1, 2, 4, 7]] + assert_series_equal(actual, expected) + + # Now index is sequentially decreasing. + df.index = np.arange(len(df) - 1, -1, -1) + s = df['pid'] + grouped = df.groupby('tag') + actual = grouped.filter(lambda x: len(x) > 1) + expected = df.iloc[[1, 2, 4, 7]] + assert_frame_equal(actual, expected) + + grouped = s.groupby(df['tag']) + actual = grouped.filter(lambda x: len(x) > 1) + expected = s.iloc[[1, 2, 4, 7]] + assert_series_equal(actual, expected) + + # Index is shuffled. + SHUFFLED = [4, 6, 7, 2, 1, 0, 5, 3] + df.index = df.index[SHUFFLED] + s = df['pid'] + grouped = df.groupby('tag') + actual = grouped.filter(lambda x: len(x) > 1) + expected = df.iloc[[1, 2, 4, 7]] + assert_frame_equal(actual, expected) + + grouped = s.groupby(df['tag']) + actual = grouped.filter(lambda x: len(x) > 1) + expected = s.iloc[[1, 2, 4, 7]] + assert_series_equal(actual, expected) + def test_groupby_whitelist(self): from string import ascii_lowercase letters = np.array(list(ascii_lowercase))