Merge pull request #9994 from evanpw/issue_9921

jreback · jreback · commit c9d1ef978811 · 2015-04-29T06:31:53.000-04:00
BUG: transform and filter misbehave when grouping on categorical data
diff --git a/doc/source/whatsnew/v0.16.1.txt b/doc/source/whatsnew/v0.16.1.txt
@@ -252,3 +252,5 @@ Bug Fixes
 
 
 - Bug in hiding ticklabels with subplots and shared axes when adding a new plot to an existing grid of axes (:issue:`9158`)
+- Bug in ``transform`` and ``filter`` when grouping on a categorical variable (:issue:`9921`)
+- Bug in ``transform`` when groups are equal in number and dtype to the input index (:issue:`9700`)
diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py
@@ -25,8 +25,8 @@
                                notnull, _DATELIKE_DTYPES, is_numeric_dtype,
                                is_timedelta64_dtype, is_datetime64_dtype,
                                is_categorical_dtype, _values_from_object,
-                               is_datetime_or_timedelta_dtype, is_bool_dtype,
-                               AbstractMethodError)
+                               is_datetime_or_timedelta_dtype, is_bool,
+                               is_bool_dtype, AbstractMethodError)
 from pandas.core.config import option_context
 import pandas.lib as lib
 from pandas.lib import Timestamp
@@ -491,7 +491,7 @@ def _set_result_index_ordered(self, result):
 
         # shortcut if we have an already ordered grouper
         if not self.grouper.is_monotonic:
-            index = Index(np.concatenate([ indices[v] for v in self.grouper.result_index ]))
+            index = Index(np.concatenate([ indices.get(v, []) for v in self.grouper.result_index]))
             result.index = index
             result = result.sort_index()
 
@@ -2436,6 +2436,8 @@ def transform(self, func, *args, **kwargs):
 
         wrapper = lambda x: func(x, *args, **kwargs)
         for i, (name, group) in enumerate(self):
+            if name not in self.indices:
+                continue
 
             object.__setattr__(group, 'name', name)
             res = wrapper(group)
@@ -2451,7 +2453,7 @@ def transform(self, func, *args, **kwargs):
             except:
                 pass
 
-            indexer = self._get_index(name)
+            indexer = self.indices[name]
             result[indexer] = res
 
         result = _possibly_downcast_to_dtype(result, dtype)
@@ -2465,9 +2467,12 @@ def _transform_fast(self, func):
         """
         if isinstance(func, compat.string_types):
             func = getattr(self,func)
+
         values = func().values
-        counts = self.size().values
+        counts = self.size().fillna(0).values
         values = np.repeat(values, com._ensure_platform_int(counts))
+        if any(counts == 0):
+            values = self._try_cast(values, self._selected_obj)
 
         return self._set_result_index_ordered(Series(values))
 
@@ -2502,8 +2507,11 @@ def true_and_notnull(x, *args, **kwargs):
             return b and notnull(b)
 
         try:
-            indices = [self._get_index(name) if true_and_notnull(group) else []
-                       for name, group in self]
+            indices = []
+            for name, group in self:
+                if true_and_notnull(group) and name in self.indices:
+                    indices.append(self.indices[name])
+
         except ValueError:
             raise TypeError("the filter must return a boolean result")
         except TypeError:
@@ -3020,24 +3028,18 @@ def transform(self, func, *args, **kwargs):
         if not result.columns.equals(obj.columns):
             return self._transform_general(func, *args, **kwargs)
 
-        # a grouped that doesn't preserve the index, remap index based on the grouper
-        # and broadcast it
-        if ((not isinstance(obj.index,MultiIndex) and
-             type(result.index) != type(obj.index)) or
-            len(result.index) != len(obj.index)):
-            results = np.empty_like(obj.values, result.values.dtype)
-            indices = self.indices
-            for (name, group), (i, row) in zip(self, result.iterrows()):
+        results = np.empty_like(obj.values, result.values.dtype)
+        indices = self.indices
+        for (name, group), (i, row) in zip(self, result.iterrows()):
+            if name in indices:
                 indexer = indices[name]
                 results[indexer] = np.tile(row.values,len(indexer)).reshape(len(indexer),-1)
-            return DataFrame(results,columns=result.columns,index=obj.index).convert_objects()
 
-        # we can merge the result in
-        # GH 7383
-        names = result.columns
-        result = obj.merge(result, how='outer', left_index=True, right_index=True).iloc[:,-result.shape[1]:]
-        result.columns = names
-        return result
+        counts = self.size().fillna(0).values
+        if any(counts == 0):
+            results = self._try_cast(results, obj[result.columns])
+
+        return DataFrame(results,columns=result.columns,index=obj.index).convert_objects()
 
     def _define_paths(self, func, *args, **kwargs):
         if isinstance(func, compat.string_types):
@@ -3129,10 +3131,9 @@ def filter(self, func, dropna=True, *args, **kwargs):
                 pass
 
             # interpret the result of the filter
-            if (isinstance(res, (bool, np.bool_)) or
-                np.isscalar(res) and isnull(res)):
-                if res and notnull(res):
-                    indices.append(self._get_index(name))
+            if is_bool(res) or (lib.isscalar(res) and isnull(res)):
+                if res and notnull(res) and name in self.indices:
+                    indices.append(self.indices[name])
             else:
                 # non scalars aren't allowed
                 raise TypeError("filter function returned a %s, "
diff --git a/pandas/tests/test_categorical.py b/pandas/tests/test_categorical.py
@@ -1820,6 +1820,27 @@ def f(x):
         expected['person_name'] = expected['person_name'].astype('object')
         tm.assert_frame_equal(result, expected)
 
+        # GH 9921
+        # Monotonic
+        df = DataFrame({"a": [5, 15, 25]})
+        c = pd.cut(df.a, bins=[0,10,20,30,40])
+        tm.assert_series_equal(df.a.groupby(c).transform(sum), df['a'])
+        tm.assert_series_equal(df.a.groupby(c).transform(lambda xs: np.sum(xs)), df['a'])
+        tm.assert_frame_equal(df.groupby(c).transform(sum), df[['a']])
+        tm.assert_frame_equal(df.groupby(c).transform(lambda xs: np.max(xs)), df[['a']])
+
+        # Filter
+        tm.assert_series_equal(df.a.groupby(c).filter(np.all), df['a'])
+        tm.assert_frame_equal(df.groupby(c).filter(np.all), df)
+
+        # Non-monotonic
+        df = DataFrame({"a": [5, 15, 25, -5]})
+        c = pd.cut(df.a, bins=[-10, 0,10,20,30,40])
+        tm.assert_series_equal(df.a.groupby(c).transform(sum), df['a'])
+        tm.assert_series_equal(df.a.groupby(c).transform(lambda xs: np.sum(xs)), df['a'])
+        tm.assert_frame_equal(df.groupby(c).transform(sum), df[['a']])
+        tm.assert_frame_equal(df.groupby(c).transform(lambda xs: np.sum(xs)), df[['a']])
+
     def test_pivot_table(self):
 
         raw_cat1 = Categorical(["a","a","b","b"], categories=["a","b","z"], ordered=True)
diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py
@@ -960,6 +960,12 @@ def demean(arr):
         g = df.groupby(pd.TimeGrouper('M'))
         g.transform(lambda x: x-1)
 
+        # GH 9700
+        df = DataFrame({'a' : range(5, 10), 'b' : range(5)})
+        result = df.groupby('a').transform(max)
+        expected = DataFrame({'b' : range(5)})
+        tm.assert_frame_equal(result, expected)
+
     def test_transform_fast(self):
 
         df = DataFrame( { 'id' : np.arange( 100000 ) / 3,

Original file line number	Diff line number	Diff line change
`@@ -252,3 +252,5 @@ Bug Fixes`
`252`	`252`
`253`	`253`
`254`	`254`	- Bug in hiding ticklabels with subplots and shared axes when adding a new plot to an existing grid of axes (:issue:`9158`)
	`255`	+- Bug in ``transform`` and ``filter`` when grouping on a categorical variable (:issue:`9921`)
	`256`	+- Bug in ``transform`` when groups are equal in number and dtype to the input index (:issue:`9700`)