BUG: Filter/transform fail in some cases when multi-grouping with a datetime-like key (GH pandas-dev#10114)

Evan Wright · evanpw · commit 441adbcb7955 · 2015-05-13T03:55:20.000-04:00
diff --git a/doc/source/whatsnew/v0.17.0.txt b/doc/source/whatsnew/v0.17.0.txt
@@ -62,6 +62,7 @@ Bug Fixes
 
 - Bug in ``Categorical`` repr with ``display.width`` of ``None`` in Python 3 (:issue:`10087`)
 
+- Bug in ``filter`` (regression from 0.16.0) and ``transform`` when grouping on multiple keys, one of which is datetime-like (:issue:`10114`)
 
 - Bug in ``Timestamp``'s' ``microsecond``, ``quarter``, ``dayofyear``, ``week`` and ``daysinmonth`` properties return ``np.int`` type, not built-in ``int``. (:issue:`10050`)
 - Bug in ``NaT`` raises ``AttributeError`` when accessing to ``daysinmonth``, ``dayofweek`` properties. (:issue:`10096`)
diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py
@@ -413,42 +413,54 @@ def indices(self):
         """ dict {group name -> group indices} """
         return self.grouper.indices
 
-    def _get_index(self, name):
-        """ safe get index, translate keys for datelike to underlying repr """
+    def _get_indices(self, names, raise_on_missing=False):
+        """ safe get multiple indices, translate keys for datelike to underlying repr """
 
-        def convert(key, s):
+        def get_converter(s):
             # possibly convert to they actual key types
             # in the indices, could be a Timestamp or a np.datetime64
-
             if isinstance(s, (Timestamp,datetime.datetime)):
-                return Timestamp(key)
+                return lambda key: Timestamp(key)
             elif isinstance(s, np.datetime64):
-                return Timestamp(key).asm8
-            return key
+                return lambda key: Timestamp(key).asm8
+            else:
+                return lambda key: key
 
-        sample = next(iter(self.indices))
-        if isinstance(sample, tuple):
-            if not isinstance(name, tuple):
+        if len(names) == 0:
+            return []
+
+        index_sample = next(iter(self.indices))
+        name_sample = names[0]
+        if isinstance(index_sample, tuple):
+            if not isinstance(name_sample, tuple):
                 msg = ("must supply a tuple to get_group with multiple"
                        " grouping keys")
                 raise ValueError(msg)
-            if not len(name) == len(sample):
+            if not len(name_sample) == len(index_sample):
                 try:
                     # If the original grouper was a tuple
-                    return self.indices[name]
+                    return [self.indices[name] for name in names]
                 except KeyError:
                     # turns out it wasn't a tuple
                     msg = ("must supply a a same-length tuple to get_group"
                            " with multiple grouping keys")
                     raise ValueError(msg)
 
-            name = tuple([ convert(n, k) for n, k in zip(name,sample) ])
+            converters = [get_converter(s) for s in index_sample]
+            names = [tuple([f(n) for f, n in zip(converters, name)]) for name in names]
 
         else:
+            converter = get_converter(index_sample)
+            names = [converter(name) for name in names]
 
-            name = convert(name, sample)
+        if raise_on_missing:
+            return [self.indices[name] for name in names]
+        else:
+            return [self.indices.get(name, []) for name in names]
 
-        return self.indices[name]
+    def _get_index(self, name, raise_on_missing=False):
+        """ safe get index, translate keys for datelike to underlying repr """
+        return self._get_indices([name], raise_on_missing)[0]
 
     @property
     def name(self):
@@ -494,7 +506,7 @@ def _set_result_index_ordered(self, result):
 
         # shortcut of we have an already ordered grouper
         if not self.grouper.is_monotonic:
-            index = Index(np.concatenate([ indices.get(v, []) for v in self.grouper.result_index]))
+            index = Index(np.concatenate(self._get_indices(self.grouper.result_index)))
             result.index = index
             result = result.sort_index()
 
@@ -598,7 +610,7 @@ def get_group(self, name, obj=None):
         if obj is None:
             obj = self._selected_obj
 
-        inds = self._get_index(name)
+        inds = self._get_index(name, raise_on_missing=True)
         return obj.take(inds, axis=self.axis, convert=False)
 
     def __iter__(self):
@@ -2445,9 +2457,6 @@ def transform(self, func, *args, **kwargs):
 
         wrapper = lambda x: func(x, *args, **kwargs)
         for i, (name, group) in enumerate(self):
-            if name not in self.indices:
-                continue
-
             object.__setattr__(group, 'name', name)
             res = wrapper(group)
 
@@ -2462,7 +2471,7 @@ def transform(self, func, *args, **kwargs):
             except:
                 pass
 
-            indexer = self.indices[name]
+            indexer = self._get_index(name)
             result[indexer] = res
 
         result = _possibly_downcast_to_dtype(result, dtype)
@@ -2516,11 +2525,8 @@ def true_and_notnull(x, *args, **kwargs):
             return b and notnull(b)
 
         try:
-            indices = []
-            for name, group in self:
-                if true_and_notnull(group) and name in self.indices:
-                    indices.append(self.indices[name])
-
+            indices = [self._get_index(name) for name, group in self
+                       if true_and_notnull(group)]
         except ValueError:
             raise TypeError("the filter must return a boolean result")
         except TypeError:
@@ -3040,8 +3046,8 @@ def transform(self, func, *args, **kwargs):
         results = np.empty_like(obj.values, result.values.dtype)
         indices = self.indices
         for (name, group), (i, row) in zip(self, result.iterrows()):
-            if name in indices:
-                indexer = indices[name]
+            indexer = self._get_index(name)
+            if len(indexer) > 0:
                 results[indexer] = np.tile(row.values,len(indexer)).reshape(len(indexer),-1)
 
         counts = self.size().fillna(0).values
@@ -3141,8 +3147,8 @@ def filter(self, func, dropna=True, *args, **kwargs):
 
             # interpret the result of the filter
             if is_bool(res) or (lib.isscalar(res) and isnull(res)):
-                if res and notnull(res) and name in self.indices:
-                    indices.append(self.indices[name])
+                if res and notnull(res):
+                    indices.append(self._get_index(name))
             else:
                 # non scalars aren't allowed
                 raise TypeError("filter function returned a %s, "
diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py
@@ -4335,6 +4335,32 @@ def test_filter_maintains_ordering(self):
         expected = s.iloc[[1, 2, 4, 7]]
         assert_series_equal(actual, expected)
 
+    def test_filter_multiple_timestamp(self):
+        # GH 10114
+        df = DataFrame({'A' : np.arange(5),
+                        'B' : ['foo','bar','foo','bar','bar'],
+                        'C' : Timestamp('20130101') })
+
+        grouped = df.groupby(['B', 'C'])
+
+        result = grouped['A'].filter(lambda x: True)
+        assert_series_equal(df['A'], result)
+
+        result = grouped['A'].transform(len)
+        expected = Series([2, 3, 2, 3, 3], name='A')
+        assert_series_equal(result, expected)
+
+        result = grouped.filter(lambda x: True)
+        assert_frame_equal(df, result)
+
+        result = grouped.transform('sum')
+        expected = DataFrame({'A' : [2, 8, 2, 8, 8]})
+        assert_frame_equal(result, expected)
+
+        result = grouped.transform(len)
+        expected = DataFrame({'A' : [2, 3, 2, 3, 3]})
+        assert_frame_equal(result, expected)
+
     def test_filter_and_transform_with_non_unique_int_index(self):
         # GH4620
         index = [1, 1, 1, 2, 1, 1, 0, 1]