Merge pull request #6569 from hayd/groupby_nth

hayd · hayd · commit 6e758b7262e9 · 2014-03-07T12:24:22.000-08:00
ENH/BUG groupby nth now filters, works with DataFrames
diff --git a/doc/source/groupby.rst b/doc/source/groupby.rst
@@ -738,6 +738,34 @@ This shows the first or last n rows from each group.
         1 0  1  2
         5 2  5  6
 
+Taking the nth row of each group
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+To select the nth item from each group of a DataFrame or Series, use the ``nth`` method:
+
+.. ipython:: python
+
+   df = DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=['A', 'B'])
+   g = df.groupby('A')
+   g.nth(0)
+
+   g.nth(1)
+
+   g.nth(-1)
+
+If you want to select the nth not-null item, use the ``dropna`` kwarg. For a DataFrame this should be either ``'any'`` or ``'all'``, just like you would pass to ``dropna``; for a Series this just needs to be truthy.
+
+.. ipython:: python
+
+   g.nth(0, dropna='any')
+
+   g.nth(1, dropna='any')  # NaNs denote group exhausted when using dropna
+
+   g.B.nth(0, dropna=True)
+
+.. warning::
+
+   Before 0.14.0 this method existed but did not work correctly on DataFrames. The API has changed so that it filters by default, but the old behaviour (for Series) can be achieved by passing dropna. An alternative is to dropna before doing the groupby.
 
 Enumerate group items
 ~~~~~~~~~~~~~~~~~~~~~
diff --git a/doc/source/v0.14.0.txt b/doc/source/v0.14.0.txt
@@ -62,7 +62,7 @@ These are out-of-bounds selections
      s.index.year
 
 - More consistent behaviour for some groupby methods:
-   - groupby head and tail now act more like filter rather than an aggregation:
+   - groupby ``head`` and ``tail`` now act more like ``filter`` rather than an aggregation:
 
   .. ipython:: python
 
@@ -78,6 +78,16 @@ These are out-of-bounds selections
 
      g[['B']].head(1)
 
+   - groupby ``nth`` now filters by default, with an optional ``dropna`` argument to
+     ignore NaN (which replicates the previous behaviour):
+
+  .. ipython:: python
+
+     df = DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=['A', 'B'])
+     g = df.groupby('A')
+     g.nth(0)  # can also use negative ints
+
+     g.nth(0, dropna='any')  # similar to old behaviour
 
 - Local variable usage has changed in
   :func:`pandas.eval`/:meth:`DataFrame.eval`/:meth:`DataFrame.query`
diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py
@@ -523,15 +523,75 @@ def ohlc(self):
         """
         return self._cython_agg_general('ohlc')
 
-    def nth(self, n):
-        def picker(arr):
-            arr = arr[notnull(arr)]
-            if len(arr) >= n + 1:
-                return arr.iget(n)
+    def nth(self, n, dropna=None):
+        """
+        Take the nth row from each group.
+
+        If dropna, will take the nth non-null row; dropna is either
+        Truthy (if a Series) or 'all', 'any' (if a DataFrame); this is equivalent
+        to calling dropna(how=dropna) before the groupby.
+
+        Examples
+        --------
+        >>> df = DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=['A', 'B'])
+        >>> g = df.groupby('A')
+        >>> g.nth(0)
+           A   B
+        0  1 NaN
+        2  5   6
+        >>> g.nth(1)
+           A  B
+        1  1  4
+        >>> g.nth(-1)
+           A  B
+        1  1  4
+        2  5  6
+        >>> g.nth(0, dropna='any')
+           B
+        A
+        1  4
+        5  6
+        >>> g.nth(1, dropna='any')  # NaNs denote group exhausted when using dropna
+            B
+        A
+        1 NaN
+        5 NaN
+
+        """
+
+        if not dropna:  # default: filter rows by position; NaN rows are kept
+            m = self.grouper._max_groupsize
+            if n >= m or n < -m:  # no group is long enough to have an nth row
+                return self._selected_obj.loc[[]]
+            rng = np.zeros(m, dtype=bool)  # mask over in-group positions
+            if n >= 0:
+                rng[n] = True
+                is_nth = self._cumcount_array(rng)
             else:
+                rng[- n - 1] = True  # negative n: position counted from the group end
+                is_nth = self._cumcount_array(rng, ascending=False)
+            return self._selected_obj[is_nth]
+
+        if (isinstance(self._selected_obj, DataFrame)
+            and dropna not in ['any', 'all']):
+            # Note: when agg-ing picker doesn't raise this, just returns NaN
+            raise ValueError("For a DataFrame groupby, dropna must be "
+                             "either None, 'any' or 'all', "
+                             "(was passed %s)." % (dropna),)
+
+        # old (aggregating) behaviour, but with 'all'/'any' support for DataFrames.
+
+        max_len = n if n >= 0 else - 1 - n  # a group needs more than max_len rows
+        def picker(x):
+            x = x.dropna(how=dropna)  # Note: how is ignored if Series
+            if len(x) <= max_len:  # group exhausted after dropping nulls
                 return np.nan
+            else:
+                return x.iloc[n]
+
         return self.agg(picker)
 
+
     def cumcount(self, **kwargs):
         """
         Number each item in each group from 0 to the length of that group - 1.
@@ -579,8 +639,7 @@ def cumcount(self, **kwargs):
         ascending = kwargs.pop('ascending', True)
 
         index = self.obj.index
-        rng = np.arange(self.grouper._max_groupsize, dtype='int64')
-        cumcounts = self._cumcount_array(rng, ascending=ascending)
+        cumcounts = self._cumcount_array(ascending=ascending)
         return Series(cumcounts, index)
 
     def head(self, n=5):
@@ -606,8 +665,7 @@ def head(self, n=5):
 
         """
         obj = self._selected_obj
-        rng = np.arange(self.grouper._max_groupsize, dtype='int64')
-        in_head = self._cumcount_array(rng) < n
+        in_head = self._cumcount_array() < n
         head = obj[in_head]
         return head
 
@@ -639,11 +697,17 @@ def tail(self, n=5):
         tail = obj[in_tail]
         return tail
 
-    def _cumcount_array(self, arr, **kwargs):
+    def _cumcount_array(self, arr=None, **kwargs):
+        """
+        arr is where cumcount gets its values from
+        """
         ascending = kwargs.pop('ascending', True)
 
+        if arr is None:
+            arr = np.arange(self.grouper._max_groupsize, dtype='int64')
+
         len_index = len(self.obj.index)
-        cumcounts = np.zeros(len_index, dtype='int64')
+        cumcounts = np.empty(len_index, dtype=arr.dtype)
         if ascending:
             for v in self.indices.values():
                 cumcounts[v] = arr[:len(v)]
diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py
@@ -156,19 +156,18 @@ def test_first_last_nth(self):
         assert_frame_equal(last, expected, check_names=False)
 
         nth = grouped.nth(1)
-        expected = self.df.ix[[3, 2], ['B', 'C', 'D']]
-        expected.index = ['bar', 'foo']
+        expected = self.df.iloc[[2, 3]]
         assert_frame_equal(nth, expected, check_names=False)
 
         # it works!
         grouped['B'].first()
         grouped['B'].last()
         grouped['B'].nth(0)
 
-        self.df['B'][self.df['A'] == 'foo'] = np.nan
+        self.df.loc[self.df['A'] == 'foo', 'B'] = np.nan
         self.assert_(com.isnull(grouped['B'].first()['foo']))
         self.assert_(com.isnull(grouped['B'].last()['foo']))
-        self.assert_(com.isnull(grouped['B'].nth(0)['foo']))
+        self.assert_(com.isnull(grouped['B'].nth(0)[0]))  # not sure what this is testing
 
     def test_first_last_nth_dtypes(self):
 
@@ -189,8 +188,7 @@ def test_first_last_nth_dtypes(self):
         assert_frame_equal(last, expected, check_names=False)
 
         nth = grouped.nth(1)
-        expected = df.ix[[3, 2], ['B', 'C', 'D', 'E', 'F']]
-        expected.index = ['bar', 'foo']
+        expected = df.iloc[[2, 3]]
         assert_frame_equal(nth, expected, check_names=False)
 
         # GH 2763, first/last shifting dtypes
@@ -201,6 +199,29 @@ def test_first_last_nth_dtypes(self):
         f = s.groupby(level=0).first()
         self.assertEqual(f.dtype, 'int64')
 
+    def test_nth(self):  # nth as a positional filter (default) and with dropna='any'
+        df = DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=['A', 'B'])
+        g = df.groupby('A')
+
+        assert_frame_equal(g.nth(0), df.iloc[[0, 2]])
+        assert_frame_equal(g.nth(1), df.iloc[[1]])
+        assert_frame_equal(g.nth(2), df.loc[[]])  # beyond the largest group -> empty
+        assert_frame_equal(g.nth(-1), df.iloc[[1, 2]])
+        assert_frame_equal(g.nth(-2), df.iloc[[0]])
+        assert_frame_equal(g.nth(-3), df.loc[[]])  # beyond the largest group -> empty
+        assert_series_equal(g.B.nth(0), df.B.iloc[[0, 2]])
+        assert_series_equal(g.B.nth(1), df.B.iloc[[1]])
+        assert_frame_equal(g[['B']].nth(0), df.ix[[0, 2], ['B']])
+
+        exp = df.set_index('A')  # dropna path aggregates: expected is keyed by 'A'
+        assert_frame_equal(g.nth(0, dropna='any'), exp.iloc[[1, 2]])
+        assert_frame_equal(g.nth(-1, dropna='any'), exp.iloc[[1, 2]])
+
+        exp['B'] = np.nan  # exhausted groups come back as NaN
+        assert_frame_equal(g.nth(7, dropna='any'), exp.iloc[[1, 2]])
+        assert_frame_equal(g.nth(2, dropna='any'), exp.iloc[[1, 2]])
+
+
     def test_grouper_index_types(self):
         # related GH5375
         # groupby misbehaving when using a Floatlike index
diff --git a/vb_suite/groupby.py b/vb_suite/groupby.py
@@ -269,6 +269,22 @@ def f(g):
 groupby_frame_apply = Benchmark("df.groupby(['key', 'key2']).apply(f)", setup,
                                  start_date=datetime(2011, 10, 1))
 
+
+#----------------------------------------------------------------------
+# DataFrame nth
+
+setup = common_setup + """
+df = pd.DataFrame(np.random.randint(1, 100, (10000, 2)))
+"""
+
+# Not a like-for-like comparison with older versions: nth now filters rather than aggregates.
+groupby_frame_nth = Benchmark("df.groupby(0).nth(0)", setup,
+                                start_date=datetime(2014, 3, 1))
+
+groupby_series_nth = Benchmark("df[1].groupby(df[0]).nth(0)", setup,
+                                 start_date=datetime(2014, 3, 1))
+
+
 #----------------------------------------------------------------------
 # Sum booleans #2692