BUG: Bug in DataFrameGroupby.transform when transforming with a passed non-sorted key (GH8046)

jreback · jreback · commit e5923d00b36c · 2014-08-18T09:06:43.000-04:00
diff --git a/doc/source/v0.15.0.txt b/doc/source/v0.15.0.txt
@@ -465,7 +465,7 @@ Bug Fixes
   when matching block and manager items, when there's only one block there's no ambiguity (:issue:`7794`)
 
 - Bug in HDFStore iteration when passing a where (:issue:`8014`)
-
+- Bug in DataFrameGroupBy.transform when transforming with a passed non-sorted key (:issue:`8046`)
 - Bug in repeated timeseries line and area plot may result in ``ValueError`` or incorrect kind (:issue:`7733`)
 
 
diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py
@@ -475,6 +475,24 @@ def _set_selection_from_grouper(self):
             if len(groupers):
                 self._group_selection = (ax-Index(groupers)).tolist()
 
+    def _set_result_index_ordered(self, result):
+        # re-index the passed result so its rows follow the original
+        # order of self.obj, then return it
+        # see GH 8046
+
+        # the values/counts are repeated according to the group index
+        indices = self.indices
+
+        # shortcut if we have an already ordered grouper
+
+        if not Index(self.grouper.group_info[0]).is_monotonic:
+            index = Index(np.concatenate([ indices[v] for v in self.grouper.result_index ]))
+            result.index = index
+            result = result.sort_index()
+
+        result.index = self.obj.index
+        return result
+
     def _local_dir(self):
         return sorted(set(self.obj._local_dir() + list(self._apply_whitelist)))
 
@@ -2087,7 +2105,6 @@ def _convert_grouper(axis, grouper):
     else:
         return grouper
 
-
 class SeriesGroupBy(GroupBy):
     _apply_whitelist = _series_apply_whitelist
 
@@ -2319,18 +2336,7 @@ def _transform_fast(self, func):
         counts = self.count().values
         values = np.repeat(values, com._ensure_platform_int(counts))
 
-        # the values/counts are repeated according to the group index
-        indices = self.indices
-
-        # shortcut of we have an already ordered grouper
-        if Index(self.grouper.group_info[0]).is_monotonic:
-            result = Series(values, index=self.obj.index)
-        else:
-            index = Index(np.concatenate([ indices[v] for v in self.grouper.result_index ]))
-            result = Series(values, index=index).sort_index()
-            result.index = self.obj.index
-
-        return result
+        return self._set_result_index_ordered(Series(values))
 
     def filter(self, func, dropna=True, *args, **kwargs):
         """
@@ -2842,8 +2848,7 @@ def _transform_general(self, func, *args, **kwargs):
         concat_index = obj.columns if self.axis == 0 else obj.index
         concatenated = concat(applied, join_axes=[concat_index],
                               axis=self.axis, verify_integrity=False)
-        concatenated.sort_index(inplace=True)
-        return concatenated
+        return self._set_result_index_ordered(concatenated)
 
     def transform(self, func, *args, **kwargs):
         """
diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py
@@ -796,6 +796,26 @@ def test_transform(self):
         transformed = grouped.transform(lambda x: x * x.sum())
         self.assertEqual(transformed[7], 12)
 
+        # GH 8046
+        # make sure that we preserve the input order
+
+        df = DataFrame(np.arange(6,dtype='int64').reshape(3,2), columns=["a","b"], index=[0,2,1])
+        key = [0,0,1]
+        expected = df.sort_index().groupby(key).transform(lambda x: x-x.mean()).groupby(key).mean()
+        result = df.groupby(key).transform(lambda x: x-x.mean()).groupby(key).mean()
+        assert_frame_equal(result, expected)
+
+        def demean(arr):
+            return arr - arr.mean()
+
+        people = DataFrame(np.random.randn(5, 5),
+                           columns=['a', 'b', 'c', 'd', 'e'],
+                           index=['Joe', 'Steve', 'Wes', 'Jim', 'Travis'])
+        key = ['one', 'two', 'one', 'two', 'one']
+        result = people.groupby(key).transform(demean).groupby(key).mean()
+        expected = people.groupby(key).apply(demean).groupby(key).mean()
+        assert_frame_equal(result, expected)
+
     def test_transform_fast(self):
 
         df = DataFrame( { 'id' : np.arange( 100000 ) / 3,
@@ -2924,7 +2944,7 @@ def __call__(self, x):
                            lambda x: sum(x),
                            lambda x: x.sum(),
                            partial(sum), fn_class()]
-        
+
         expected = df.groupby("foo").agg(sum)
         for ecall in equiv_callables:
             result = df.groupby('foo').agg(ecall)