From e5923d00b36c942a9706b09ebaf786ce14d94c0c Mon Sep 17 00:00:00 2001 From: jreback Date: Sun, 17 Aug 2014 18:01:52 -0400 Subject: [PATCH] BUG: Bug in DataFrameGroupby.transform when transforming with a passed non-sorted key (GH8046) --- doc/source/v0.15.0.txt | 2 +- pandas/core/groupby.py | 35 ++++++++++++++++++++--------------- pandas/tests/test_groupby.py | 22 +++++++++++++++++++++- 3 files changed, 42 insertions(+), 17 deletions(-) diff --git a/doc/source/v0.15.0.txt b/doc/source/v0.15.0.txt index 2e3841e8a00c3..0a857adbe84e8 100644 --- a/doc/source/v0.15.0.txt +++ b/doc/source/v0.15.0.txt @@ -465,7 +465,7 @@ Bug Fixes when matching block and manager items, when there's only one block there's no ambiguity (:issue:`7794`) - Bug in HDFStore iteration when passing a where (:issue:`8014`) - +- Bug in DataFrameGroupBy.transform when transforming with a passed non-sorted key (:issue:`8046`) - Bug in repeated timeseries line and area plot may result in ``ValueError`` or incorrect kind (:issue:`7733`) diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index ce57a9c03d570..eaaf85a1f5f84 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -475,6 +475,24 @@ def _set_selection_from_grouper(self): if len(groupers): self._group_selection = (ax-Index(groupers)).tolist() + def _set_result_index_ordered(self, result): + # set the result index on the passed values object + # return the new object + # related 8046 + + # the values/counts are repeated according to the group index + indices = self.indices + + # shortcut if we have an already ordered grouper + + if not Index(self.grouper.group_info[0]).is_monotonic: + index = Index(np.concatenate([ indices[v] for v in self.grouper.result_index ])) + result.index = index + result = result.sort_index() + + result.index = self.obj.index + return result + def _local_dir(self): return sorted(set(self.obj._local_dir() + list(self._apply_whitelist))) @@ -2087,7 +2105,6 @@ def _convert_grouper(axis, grouper): 
else: return grouper - class SeriesGroupBy(GroupBy): _apply_whitelist = _series_apply_whitelist @@ -2319,18 +2336,7 @@ def _transform_fast(self, func): counts = self.count().values values = np.repeat(values, com._ensure_platform_int(counts)) - # the values/counts are repeated according to the group index - indices = self.indices - - # shortcut of we have an already ordered grouper - if Index(self.grouper.group_info[0]).is_monotonic: - result = Series(values, index=self.obj.index) - else: - index = Index(np.concatenate([ indices[v] for v in self.grouper.result_index ])) - result = Series(values, index=index).sort_index() - result.index = self.obj.index - - return result + return self._set_result_index_ordered(Series(values)) def filter(self, func, dropna=True, *args, **kwargs): """ @@ -2842,8 +2848,7 @@ def _transform_general(self, func, *args, **kwargs): concat_index = obj.columns if self.axis == 0 else obj.index concatenated = concat(applied, join_axes=[concat_index], axis=self.axis, verify_integrity=False) - concatenated.sort_index(inplace=True) - return concatenated + return self._set_result_index_ordered(concatenated) def transform(self, func, *args, **kwargs): """ diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py index 6f39750de9d9b..5d087a1ae0810 100644 --- a/pandas/tests/test_groupby.py +++ b/pandas/tests/test_groupby.py @@ -796,6 +796,26 @@ def test_transform(self): transformed = grouped.transform(lambda x: x * x.sum()) self.assertEqual(transformed[7], 12) + # GH 8046 + # make sure that we preserve the input order + + df = DataFrame(np.arange(6,dtype='int64').reshape(3,2), columns=["a","b"], index=[0,2,1]) + key = [0,0,1] + expected = df.sort_index().groupby(key).transform(lambda x: x-x.mean()).groupby(key).mean() + result = df.groupby(key).transform(lambda x: x-x.mean()).groupby(key).mean() + assert_frame_equal(result, expected) + + def demean(arr): + return arr - arr.mean() + + people = DataFrame(np.random.randn(5, 5), + 
columns=['a', 'b', 'c', 'd', 'e'], + index=['Joe', 'Steve', 'Wes', 'Jim', 'Travis']) + key = ['one', 'two', 'one', 'two', 'one'] + result = people.groupby(key).transform(demean).groupby(key).mean() + expected = people.groupby(key).apply(demean).groupby(key).mean() + assert_frame_equal(result, expected) + def test_transform_fast(self): df = DataFrame( { 'id' : np.arange( 100000 ) / 3, @@ -2924,7 +2944,7 @@ def __call__(self, x): lambda x: sum(x), lambda x: x.sum(), partial(sum), fn_class()] - + expected = df.groupby("foo").agg(sum) for ecall in equiv_callables: result = df.groupby('foo').agg(ecall)