API: update nth to use the _set_selection_from_grouper

jreback · jreback · commit 91e9d817fafe · 2014-05-12T08:01:18.000-04:00
makes first == nth(0) and last == nth(-1)
diff --git a/doc/source/groupby.rst b/doc/source/groupby.rst
@@ -397,7 +397,7 @@ index are the group names and whose values are the sizes of each group.
    named *columns*.
 
    Aggregating functions are ones that reduce the dimension of the returned objects,
-   for example: ``mean, sum, size, count, std, var, describe, first, last, min, max``. This is
+   for example: ``mean, sum, size, count, std, var, describe, first, last, nth, min, max``. This is
    what happens when you do for example ``DataFrame.sum()`` and get back a ``Series``.
 
 .. _groupby.aggregate.multifunc:
@@ -613,7 +613,7 @@ For dataframes with multiple columns, filters should explicitly specify a column
    a reduced shape of the original (and potentitally eliminating groups), but with the index unchanged.
    Passing ``as_index=False`` will not affect these transformation methods.
 
-   For example: ``head, tail nth``.
+   For example: ``head, tail``.
 
    .. ipython:: python
 
diff --git a/doc/source/release.rst b/doc/source/release.rst
@@ -190,7 +190,7 @@ API Changes
   validation warnings in :func:`read_csv`/:func:`read_table` (:issue:`6607`)
 - Raise a ``TypeError`` when ``DataFrame`` is passed an iterator as the
   ``data`` argument (:issue:`5357`)
-- groupby will now not return the grouped column for non-cython functions (:issue:`5610`, :issue:`5614`),
+- groupby will now not return the grouped column for non-cython functions (:issue:`5610`, :issue:`5614`, :issue:`6732`),
   as its already the index
 - ``DataFrame.plot`` and ``Series.plot`` now supports area plot with specifying ``kind='area'`` (:issue:`6656`)
 - Line plot can be stacked by ``stacked=True``. (:issue:`6656`)
diff --git a/doc/source/v0.14.0.txt b/doc/source/v0.14.0.txt
@@ -124,7 +124,7 @@ API changes
 
      g.nth(0, dropna='any')  # similar to old behaviour
 
-  groupby will now not return the grouped column for non-cython functions (:issue:`5610`, :issue:`5614`),
+  groupby will now not return the grouped column for non-cython functions (:issue:`5610`, :issue:`5614`, :issue:`6732`),
   as its already the index
 
   .. ipython:: python
diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py
@@ -99,6 +99,7 @@ class SpecificationError(GroupByError):
 def _groupby_function(name, alias, npfunc, numeric_only=True,
                       _convert=False):
     def f(self):
+        self._set_selection_from_grouper()
         try:
             return self._cython_agg_general(alias, numeric_only=numeric_only)
         except AssertionError as e:
@@ -356,6 +357,7 @@ class GroupBy(PandasObject):
     _apply_whitelist = _common_apply_whitelist
     _internal_names = ['_cache']
     _internal_names_set = set(_internal_names)
+    _group_selection = None
 
     def __init__(self, obj, keys=None, axis=0, level=None,
                  grouper=None, exclusions=None, selection=None, as_index=True,
@@ -454,18 +456,20 @@ def _selection_list(self):
     def _selected_obj(self):
 
         if self._selection is None or isinstance(self.obj, Series):
+            if self._group_selection is not None:
+                return self.obj[self._group_selection]
             return self.obj
         else:
             return self.obj[self._selection]
 
     def _set_selection_from_grouper(self):
         """ we may need create a selection if we have non-level groupers """
         grp = self.grouper
-        if self._selection is None and self.as_index and getattr(grp,'groupings',None) is not None:
+        if self.as_index and getattr(grp,'groupings',None) is not None:
             ax = self.obj._info_axis
             groupers = [ g.name for g in grp.groupings if g.level is None and g.name is not None and g.name in ax ]
             if len(groupers):
-                self._selection = (ax-Index(groupers)).tolist()
+                self._group_selection = (ax-Index(groupers)).tolist()
 
     def _local_dir(self):
         return sorted(set(self.obj._local_dir() + list(self._apply_whitelist)))
@@ -776,6 +780,7 @@ def nth(self, n, dropna=None):
 
         """
 
+        self._set_selection_from_grouper()
         if not dropna:  # good choice
             m = self.grouper._max_groupsize
             if n >= m or n < -m:
@@ -787,7 +792,21 @@ def nth(self, n, dropna=None):
             else:
                 rng[- n - 1] = True
                 is_nth = self._cumcount_array(rng, ascending=False)
-            return self._selected_obj[is_nth]
+
+            result = self._selected_obj[is_nth]
+
+            # the result index
+            if self.as_index:
+                ax = self.obj._info_axis
+                names = self.grouper.names
+                if all([ n in ax for n in names ]):
+                    result.index = Index(self.obj[names][is_nth].values.ravel()).set_names(names)
+                elif self._group_selection is not None:
+                    result.index = self.obj._get_axis(self.axis)[is_nth]
+
+                result = result.sort_index()
+
+            return result
 
         if (isinstance(self._selected_obj, DataFrame)
            and dropna not in ['any', 'all']):
@@ -853,6 +872,7 @@ def cumcount(self, **kwargs):
         dtype: int64
 
         """
+        self._set_selection_from_grouper()
         ascending = kwargs.pop('ascending', True)
 
         index = self._selected_obj.index
diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py
@@ -166,18 +166,27 @@ def test_first_last_nth(self):
         # tests for first / last / nth
         grouped = self.df.groupby('A')
         first = grouped.first()
-        expected = self.df.ix[[1, 0], ['B', 'C', 'D']]
-        expected.index = ['bar', 'foo']
-        assert_frame_equal(first, expected, check_names=False)
+        expected = self.df.ix[[1, 0], ['B','C','D']]
+        expected.index = Index(['bar', 'foo'],name='A')
+        expected = expected.sort_index()
+        assert_frame_equal(first, expected)
+
+        nth = grouped.nth(0)
+        assert_frame_equal(nth, expected)
 
         last = grouped.last()
-        expected = self.df.ix[[5, 7], ['B', 'C', 'D']]
-        expected.index = ['bar', 'foo']
-        assert_frame_equal(last, expected, check_names=False)
+        expected = self.df.ix[[5, 7], ['B','C','D']]
+        expected.index = Index(['bar', 'foo'],name='A')
+        assert_frame_equal(last, expected)
+
+        nth = grouped.nth(-1)
+        assert_frame_equal(nth, expected)
 
         nth = grouped.nth(1)
-        expected = self.df.iloc[[2, 3]]
-        assert_frame_equal(nth, expected, check_names=False)
+        expected = self.df.ix[[2, 3],['B','C','D']].copy()
+        expected.index = Index(['foo', 'bar'],name='A')
+        expected = expected.sort_index()
+        assert_frame_equal(nth, expected)
 
         # it works!
         grouped['B'].first()
@@ -189,6 +198,17 @@ def test_first_last_nth(self):
         self.assert_(com.isnull(grouped['B'].last()['foo']))
         self.assert_(com.isnull(grouped['B'].nth(0)[0]))  # not sure what this is testing
 
+        # v0.14.0 whatsnew
+        df = DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=['A', 'B'])
+        g = df.groupby('A')
+        result = g.first()
+        expected = df.iloc[[1,2]].set_index('A')
+        assert_frame_equal(result, expected)
+
+        expected = df.iloc[[1,2]].set_index('A')
+        result = g.nth(0,dropna='any')
+        assert_frame_equal(result, expected)
+
     def test_first_last_nth_dtypes(self):
 
         df = self.df_mixed_floats.copy()
@@ -199,17 +219,21 @@ def test_first_last_nth_dtypes(self):
         grouped = df.groupby('A')
         first = grouped.first()
         expected = df.ix[[1, 0], ['B', 'C', 'D', 'E', 'F']]
-        expected.index = ['bar', 'foo']
-        assert_frame_equal(first, expected, check_names=False)
+        expected.index = Index(['bar', 'foo'], name='A')
+        expected = expected.sort_index()
+        assert_frame_equal(first, expected)
 
         last = grouped.last()
         expected = df.ix[[5, 7], ['B', 'C', 'D', 'E', 'F']]
-        expected.index = ['bar', 'foo']
-        assert_frame_equal(last, expected, check_names=False)
+        expected.index = Index(['bar', 'foo'], name='A')
+        expected = expected.sort_index()
+        assert_frame_equal(last, expected)
 
         nth = grouped.nth(1)
-        expected = df.iloc[[2, 3]]
-        assert_frame_equal(nth, expected, check_names=False)
+        expected = df.ix[[3, 2],['B', 'C', 'D', 'E', 'F']]
+        expected.index = Index(['bar', 'foo'], name='A')
+        expected = expected.sort_index()
+        assert_frame_equal(nth, expected)
 
         # GH 2763, first/last shifting dtypes
         idx = lrange(10)
@@ -223,15 +247,15 @@ def test_nth(self):
         df = DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=['A', 'B'])
         g = df.groupby('A')
 
-        assert_frame_equal(g.nth(0), df.iloc[[0, 2]])
-        assert_frame_equal(g.nth(1), df.iloc[[1]])
-        assert_frame_equal(g.nth(2), df.loc[[]])
-        assert_frame_equal(g.nth(-1), df.iloc[[1, 2]])
-        assert_frame_equal(g.nth(-2), df.iloc[[0]])
-        assert_frame_equal(g.nth(-3), df.loc[[]])
+        assert_frame_equal(g.nth(0), df.iloc[[0, 2]].set_index('A'))
+        assert_frame_equal(g.nth(1), df.iloc[[1]].set_index('A'))
+        assert_frame_equal(g.nth(2), df.loc[[],['B']])
+        assert_frame_equal(g.nth(-1), df.iloc[[1, 2]].set_index('A'))
+        assert_frame_equal(g.nth(-2), df.iloc[[0]].set_index('A'))
+        assert_frame_equal(g.nth(-3), df.loc[[],['B']])
         assert_series_equal(g.B.nth(0), df.B.iloc[[0, 2]])
         assert_series_equal(g.B.nth(1), df.B.iloc[[1]])
-        assert_frame_equal(g[['B']].nth(0), df.ix[[0, 2], ['B']])
+        assert_frame_equal(g[['B']].nth(0), df.ix[[0, 2], ['A', 'B']].set_index('A'))
 
         exp = df.set_index('A')
         assert_frame_equal(g.nth(0, dropna='any'), exp.iloc[[1, 2]])