ENH: Add filter method to SeriesGroupBy, DataFrameGroupBy

danielballan · danielballan · commit 2a2cfb83582e · 2013-06-06T16:14:48.000-04:00
diff --git a/RELEASE.rst b/RELEASE.rst
@@ -48,6 +48,8 @@ pandas 0.11.1
   - Add iterator to ``Series.str`` (GH3638_)
   - ``pd.set_option()`` now allows N option, value pairs (GH3667_).
   - Added keyword parameters for different types of scatter_matrix subplots
+  - A ``filter`` method on grouped Series or DataFrames returns a subset of
+    the original (GH3680_, GH919_)
 
 **Improvements to existing features**
 
diff --git a/doc/source/groupby.rst b/doc/source/groupby.rst
@@ -41,6 +41,12 @@ following:
     - Standardizing data (zscore) within group
     - Filling NAs within groups with a value derived from each group
 
+ - **Filtration**: discard some groups, according to a group-wise computation
+   that evaluates to True or False. Some examples:
+
+    - Discarding data that belongs to groups with only a few members
+    - Filtering out data based on the group sum or mean
+
  - Some combination of the above: GroupBy will examine the results of the apply
    step and try to return a sensibly combined result if it doesn't fit into
    either of the above two categories
@@ -489,6 +495,39 @@ and that the transformed data contains no NAs.
    grouped_trans.count() # counts after transformation
    grouped_trans.size() # Verify non-NA count equals group size
 
+.. _groupby.filter:
+
+Filtration
+----------
+
+The ``filter`` method returns a subset of the original object. Suppose we
+want to take only elements that belong to groups with a group sum greater
+than 2.
+
+.. ipython:: python
+
+   s = Series([1, 1, 2, 3, 3, 3])
+   s.groupby(s).filter(lambda x: x.sum() > 2)
+
+The argument of ``filter`` must be a function that, applied to the group as a
+whole, returns ``True`` or ``False``.
+
+Another useful operation is filtering out elements that belong to groups
+with only a couple members.
+
+.. ipython:: python
+
+   df = DataFrame({'A': arange(8), 'B': list('aabbbbcc')})
+   df.groupby('B').filter(lambda x: len(x) > 2)
+
+Alternatively, instead of dropping the offending groups, we can return a
+like-indexed object where the groups that do not pass the filter are filled
+with NaNs.
+
+.. ipython:: python
+
+   df.groupby('B').filter(lambda x: len(x) > 2, dropna=False)
+
 .. _groupby.dispatch:
 
 Dispatching to instance methods
diff --git a/doc/source/v0.11.1.txt b/doc/source/v0.11.1.txt
@@ -237,6 +237,35 @@ Enhancements
          pd.get_option('a.b')
          pd.get_option('b.c')
 
+  - The ``filter`` method for group objects returns a subset of the original 
+    object. Suppose we want to take only elements that belong to groups with a 
+    group sum greater than 2.
+
+    .. ipython:: python
+
+       s = Series([1, 1, 2, 3, 3, 3])
+       s.groupby(s).filter(lambda x: x.sum() > 2)
+
+    The argument of ``filter`` must be a function that, applied to the group as
+    a whole, returns ``True`` or ``False``.
+
+    Another useful operation is filtering out elements that belong to groups
+    with only a couple members.
+
+    .. ipython:: python
+
+       df = DataFrame({'A': arange(8), 'B': list('aabbbbcc')})
+       df.groupby('B').filter(lambda x: len(x) > 2)
+
+    Alternatively, instead of dropping the offending groups, we can return a
+    like-indexed object where the groups that do not pass the filter are
+    filled with NaNs.
+
+    .. ipython:: python
+
+       df.groupby('B').filter(lambda x: len(x) > 2, dropna=False)
+
+
 Bug Fixes
 ~~~~~~~~~
 
diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py
@@ -1558,6 +1558,42 @@ def transform(self, func, *args, **kwargs):
         result = _possibly_downcast_to_dtype(result, dtype)
         return self.obj.__class__(result,index=self.obj.index,name=self.obj.name)
 
+    def filter(self, func, dropna=True, *args, **kwargs):
+        """
+        Return a copy of a Series without the groups that func rejects.
+
+        Parameters
+        ----------
+        func : function or string
+            Called on each group (a string names a method of the group)
+            with *args and **kwargs; should return True or False.
+        dropna : bool, default True
+            If False, elements of rejected groups are kept as NaNs.
+
+        Example
+        -------
+        >>> grouped.filter(lambda x: x.mean() > 0)
+
+        Returns
+        -------
+        filtered : Series
+        """
+        if isinstance(func, basestring):
+            wrapper = lambda x: getattr(x, func)(*args, **kwargs)
+        else:
+            wrapper = lambda x: func(x, *args, **kwargs)
+
+        indexers = [self.obj.index.get_indexer(group.index) \
+                    if wrapper(group) else [] for _ , group in self]
+
+        if len(indexers) == 0:
+            filtered = self.obj.take([]) # because np.concatenate would fail
+        else:
+            filtered = self.obj.take(np.concatenate(indexers))
+        if dropna:
+            return filtered
+        else:
+            return filtered.reindex(self.obj.index) # Fill with NaNs.
 
 class NDFrameGroupBy(GroupBy):
 
@@ -1928,47 +1964,22 @@ def transform(self, func, *args, **kwargs):
 
         obj = self._obj_with_exclusions
         gen = self.grouper.get_iterator(obj, axis=self.axis)
-
-        if isinstance(func, basestring):
-            fast_path = lambda group: getattr(group, func)(*args, **kwargs)
-            slow_path = lambda group: group.apply(lambda x: getattr(x, func)(*args, **kwargs), axis=self.axis)
-        else:
-            fast_path = lambda group: func(group, *args, **kwargs)
-            slow_path = lambda group: group.apply(lambda x: func(x, *args, **kwargs), axis=self.axis)
+        fast_path, slow_path = self._define_paths(func, *args, **kwargs)
 
         path = None
         for name, group in gen:
             object.__setattr__(group, 'name', name)
 
-            # decide on a fast path
             if path is None:
-
-                path = slow_path
+                # Try slow path and fast path.
                 try:
-                    res  = slow_path(group)
-
-                    # if we make it here, test if we can use the fast path
-                    try:
-                        res_fast = fast_path(group)
-
-                        # compare that we get the same results
-                        if res.shape == res_fast.shape:
-                            res_r = res.values.ravel()
-                            res_fast_r = res_fast.values.ravel()
-                            mask = notnull(res_r)
-                            if (res_r[mask] == res_fast_r[mask]).all():
-                                path = fast_path
-
-                    except:
-                        pass
+                    path, res = self._choose_path(fast_path, slow_path, group)
                 except TypeError:
                     return self._transform_item_by_item(obj, fast_path)
                 except Exception:  # pragma: no cover
                     res  = fast_path(group)
                     path = fast_path
-
             else:
-
                 res = path(group)
 
             # broadcasting
@@ -1988,6 +1999,35 @@ def transform(self, func, *args, **kwargs):
         concatenated.sort_index(inplace=True)
         return concatenated
 
+    def _define_paths(self, func, *args, **kwargs):  # -> (fast_path, slow_path)
+        if isinstance(func, basestring):  # func names a method to look up
+            fast_path = lambda group: getattr(group, func)(*args, **kwargs)
+            slow_path = lambda group: group.apply(lambda x: getattr(x, func)(*args, **kwargs), axis=self.axis)
+        else:  # func is called directly
+            fast_path = lambda group: func(group, *args, **kwargs)
+            slow_path = lambda group: group.apply(lambda x: func(x, *args, **kwargs), axis=self.axis)
+        return fast_path, slow_path 
+
+    def _choose_path(self, fast_path, slow_path, group):
+        """Return (path, res); path is fast_path only if it matches slow_path."""
+        path = slow_path
+        res = slow_path(group)
+        # if we make it here, test if we can use the fast path
+        try:
+            res_fast = fast_path(group)
+            # compare that we get the same results
+            if res.shape == res_fast.shape:
+                res_r = res.values.ravel()
+                res_fast_r = res_fast.values.ravel()
+                mask = notnull(res_r)
+                # nested under the shape check so res_r and mask are bound
+                if (res_r[mask] == res_fast_r[mask]).all():
+                    path = fast_path
+        except Exception:
+            # best effort: a failing fast path just means we keep slow_path
+            pass
+        return path, res
+
     def _transform_item_by_item(self, obj, wrapper):
         # iterate through columns
         output = {}
@@ -2008,6 +2048,63 @@ def _transform_item_by_item(self, obj, wrapper):
 
         return DataFrame(output, index=obj.index, columns=columns)
 
+    def filter(self, func, dropna=True, *args, **kwargs):
+        """
+        Return a copy of a DataFrame excluding elements from groups that
+        do not satisfy the boolean criterion specified by func.
+
+        Parameters
+        ----------
+        func : function
+            Function to apply to each subframe. Should return True or False.
+        dropna : bool, default True
+            Drop groups that do not pass the filter. If False, groups that
+            evaluate False are filled with NaNs.
+
+        Note
+        ----
+        Each subframe is endowed with the attribute 'name' in case you need
+        to know which group you are working on.
+
+        Example
+        --------
+        >>> grouped = df.groupby(lambda x: mapping[x])
+        >>> grouped.filter(lambda x: x['A'].sum() + x['B'].sum() > 0)
+        """
+        # positions in self.obj.index of the members of every passing group
+        indexers = []
+
+        obj = self._obj_with_exclusions
+        gen = self.grouper.get_iterator(obj, axis=self.axis)
+
+        fast_path, slow_path = self._define_paths(func, *args, **kwargs)
+
+        path = None
+        for name, group in gen:
+            object.__setattr__(group, 'name', name)
+
+            if path is None:
+                # Try slow path and fast path.
+                try:
+                    path, res = self._choose_path(fast_path, slow_path, group)
+                except Exception:  # pragma: no cover
+                    res  = fast_path(group)
+                    path = fast_path
+            else:
+                res = path(group)
+
+            if res:
+                indexers.append(self.obj.index.get_indexer(group.index))
+
+        if len(indexers) == 0:
+            filtered = self.obj.take([]) # because np.concatenate would fail
+        else:
+            filtered = self.obj.take(np.concatenate(indexers))
+        if dropna:
+            return filtered
+        else:
+            return filtered.reindex(self.obj.index) # Fill with NaNs.
+
 
 class DataFrameGroupBy(NDFrameGroupBy):
 
diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py