Skip to content

Commit 61d7e14

Browse files
committed
Merge pull request #5222 from danielballan/filter-maintains-ordering
BUG: Groupby filter maintains ordering, closes #4621
2 parents d686154 + 62b7816 commit 61d7e14

File tree

3 files changed

+50
-8
lines changed

3 files changed

+50
-8
lines changed

doc/source/release.rst

+2
Original file line number | Diff line number | Diff line change
@@ -600,6 +600,8 @@ Bug Fixes
600600
- Fixed bug where inplace setting of levels or labels on ``MultiIndex`` would
601601
not clear cached ``values`` property and therefore return wrong ``values``.
602602
(:issue:`5215`)
603+
- Fixed bug where filtering a grouped DataFrame or Series did not maintain
604+
the original ordering (:issue:`4621`).
603605

604606
pandas 0.12.0
605607
-------------

pandas/core/groupby.py

+2-2
Original file line number | Diff line number | Diff line change
@@ -1641,7 +1641,7 @@ def true_and_notnull(x, *args, **kwargs):
16411641
if len(indexers) == 0:
16421642
filtered = self.obj.take([]) # because np.concatenate would fail
16431643
else:
1644-
filtered = self.obj.take(np.concatenate(indexers))
1644+
filtered = self.obj.take(np.sort(np.concatenate(indexers)))
16451645
if dropna:
16461646
return filtered
16471647
else:
@@ -2166,7 +2166,7 @@ def add_indexer():
21662166
if len(indexers) == 0:
21672167
filtered = self.obj.take([]) # because np.concatenate would fail
21682168
else:
2169-
filtered = self.obj.take(np.concatenate(indexers))
2169+
filtered = self.obj.take(np.sort(np.concatenate(indexers)))
21702170
if dropna:
21712171
return filtered
21722172
else:

pandas/tests/test_groupby.py

+46-6
Original file line number | Diff line number | Diff line change
@@ -2622,14 +2622,12 @@ def test_filter_out_no_groups(self):
26222622
grouper = s.apply(lambda x: x % 2)
26232623
grouped = s.groupby(grouper)
26242624
filtered = grouped.filter(lambda x: x.mean() > 0)
2625-
filtered.sort() # was sorted by group
2626-
s.sort() # was sorted arbitrarily
26272625
assert_series_equal(filtered, s)
26282626
df = pd.DataFrame({'A': [1, 12, 12, 1], 'B': 'a b c d'.split()})
26292627
grouper = df['A'].apply(lambda x: x % 2)
26302628
grouped = df.groupby(grouper)
26312629
filtered = grouped.filter(lambda x: x['A'].mean() > 0)
2632-
assert_frame_equal(filtered.sort(), df)
2630+
assert_frame_equal(filtered, df)
26332631

26342632
def test_filter_condition_raises(self):
26352633
import pandas as pd
@@ -2706,7 +2704,7 @@ def test_filter_against_workaround(self):
27062704
old_way = df[grouped.floats.\
27072705
transform(lambda x: x.mean() > N/20).astype('bool')]
27082706
new_way = grouped.filter(lambda x: x['floats'].mean() > N/20)
2709-
assert_frame_equal(new_way.sort(), old_way.sort())
2707+
assert_frame_equal(new_way, old_way)
27102708

27112709
# Group by floats (rounded); filter on strings.
27122710
grouper = df.floats.apply(lambda x: np.round(x, -1))
@@ -2715,14 +2713,14 @@ def test_filter_against_workaround(self):
27152713
transform(lambda x: len(x) < N/10).astype('bool')]
27162714
new_way = grouped.filter(
27172715
lambda x: len(x.letters) < N/10)
2718-
assert_frame_equal(new_way.sort(), old_way.sort())
2716+
assert_frame_equal(new_way, old_way)
27192717

27202718
# Group by strings; filter on ints.
27212719
grouped = df.groupby('letters')
27222720
old_way = df[grouped.ints.\
27232721
transform(lambda x: x.mean() > N/20).astype('bool')]
27242722
new_way = grouped.filter(lambda x: x['ints'].mean() > N/20)
2725-
assert_frame_equal(new_way.sort_index(), old_way.sort_index())
2723+
assert_frame_equal(new_way, old_way)
27262724

27272725
def test_filter_using_len(self):
27282726
# BUG GH4447
@@ -2747,6 +2745,48 @@ def test_filter_using_len(self):
27472745
expected = s[[]]
27482746
assert_series_equal(actual, expected)
27492747

2748+
def test_filter_maintains_ordering(self):
2749+
# Simple case: index is sequential. #4621
2750+
df = DataFrame({'pid' : [1,1,1,2,2,3,3,3],
2751+
'tag' : [23,45,62,24,45,34,25,62]})
2752+
s = df['pid']
2753+
grouped = df.groupby('tag')
2754+
actual = grouped.filter(lambda x: len(x) > 1)
2755+
expected = df.iloc[[1, 2, 4, 7]]
2756+
assert_frame_equal(actual, expected)
2757+
2758+
grouped = s.groupby(df['tag'])
2759+
actual = grouped.filter(lambda x: len(x) > 1)
2760+
expected = s.iloc[[1, 2, 4, 7]]
2761+
assert_series_equal(actual, expected)
2762+
2763+
# Now index is sequentially decreasing.
2764+
df.index = np.arange(len(df) - 1, -1, -1)
2765+
s = df['pid']
2766+
grouped = df.groupby('tag')
2767+
actual = grouped.filter(lambda x: len(x) > 1)
2768+
expected = df.iloc[[1, 2, 4, 7]]
2769+
assert_frame_equal(actual, expected)
2770+
2771+
grouped = s.groupby(df['tag'])
2772+
actual = grouped.filter(lambda x: len(x) > 1)
2773+
expected = s.iloc[[1, 2, 4, 7]]
2774+
assert_series_equal(actual, expected)
2775+
2776+
# Index is shuffled.
2777+
SHUFFLED = [4, 6, 7, 2, 1, 0, 5, 3]
2778+
df.index = df.index[SHUFFLED]
2779+
s = df['pid']
2780+
grouped = df.groupby('tag')
2781+
actual = grouped.filter(lambda x: len(x) > 1)
2782+
expected = df.iloc[[1, 2, 4, 7]]
2783+
assert_frame_equal(actual, expected)
2784+
2785+
grouped = s.groupby(df['tag'])
2786+
actual = grouped.filter(lambda x: len(x) > 1)
2787+
expected = s.iloc[[1, 2, 4, 7]]
2788+
assert_series_equal(actual, expected)
2789+
27502790
def test_groupby_whitelist(self):
27512791
from string import ascii_lowercase
27522792
letters = np.array(list(ascii_lowercase))

0 commit comments

Comments
 (0)