BUG/TST: fix tests for groupby nth on Series (GH7559)

jreback · jreback · commit def0155586b6 · 2014-06-27T20:36:45.000-04:00
diff --git a/doc/source/v0.14.1.txt b/doc/source/v0.14.1.txt
@@ -173,6 +173,7 @@ Bug Fixes
 - Bug in setitem with list-of-lists and single vs mixed types (:issue:`7551`)
 - Bug in timeops with non-aligned Series (:issue:`7500`)
 - Bug in timedelta inference when assigning an incomplete Series (:issue:`7592`)
+- Bug in groupby ``.nth`` with a Series and integer-like column name (:issue:`7559`)
 
 - Bug in ``value_counts`` where ``NaT`` did not qualify as missing (``NaN``) (:issue:`7423`)
 
diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py
@@ -467,7 +467,7 @@ def _selected_obj(self):
     def _set_selection_from_grouper(self):
         """ we may need to create a selection if we have non-level groupers """
         grp = self.grouper
-        if self.as_index and getattr(grp,'groupings',None) is not None:
+        if self.as_index and getattr(grp,'groupings',None) is not None and self.obj.ndim > 1:
             ax = self.obj._info_axis
             groupers = [ g.name for g in grp.groupings if g.level is None and g.name is not None and g.name in ax ]
             if len(groupers):
@@ -759,7 +759,7 @@ def nth(self, n, dropna=None):
 
         Examples
         --------
-        >>> DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=['A', 'B'])
+        >>> df = DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=['A', 'B'])
         >>> g = df.groupby('A')
         >>> g.nth(0)
            A   B
@@ -804,7 +804,10 @@ def nth(self, n, dropna=None):
             if self.as_index:
                 ax = self.obj._info_axis
                 names = self.grouper.names
-                if all([ n in ax for n in names ]):
+                if self.obj.ndim == 1:
+                    # this is a pass-thru
+                    pass
+                elif all([ n in ax for n in names ]):
                     result.index = Index(self.obj[names][is_nth].values.ravel()).set_names(names)
                 elif self._group_selection is not None:
                     result.index = self.obj._get_axis(self.axis)[is_nth]
@@ -821,17 +824,29 @@ def nth(self, n, dropna=None):
                              "(was passed %s)." % (dropna),)
 
         # old behaviour, but with all and any support for DataFrames.
-
+        # modified in GH 7559 to have better perf
         max_len = n if n >= 0 else - 1 - n
+        dropped = self.obj.dropna(how=dropna, axis=self.axis)
 
-        def picker(x):
-            x = x.dropna(how=dropna)  # Note: how is ignored if Series
-            if len(x) <= max_len:
-                return np.nan
-            else:
-                return x.iloc[n]
+        # get a new grouper for our dropped obj
+        grouper, exclusions, obj = _get_grouper(dropped, key=self.keys, axis=self.axis,
+                                                level=self.level, sort=self.sort)
+
+        sizes = obj.groupby(grouper).size()
+        result = obj.groupby(grouper).nth(n)
+        mask = (sizes<max_len).values
+
+        # set the results which don't meet the criteria
+        if len(result) and mask.any():
+            result.loc[mask] = np.nan
 
-        return self.agg(picker)
+        # reset/reindex to the original groups
+        if len(self.obj) == len(dropped):
+            result.index = self.grouper.result_index
+        else:
+            result = result.reindex(self.grouper.result_index)
+
+        return result
 
     def cumcount(self, **kwargs):
         """
@@ -942,21 +957,33 @@ def tail(self, n=5):
     def _cumcount_array(self, arr=None, **kwargs):
         """
         arr is where cumcount gets its values from
+
+        note: this currently implements sort=False behaviour (even though
+              sort=True is the default for groupby in general)
         """
         ascending = kwargs.pop('ascending', True)
 
         if arr is None:
             arr = np.arange(self.grouper._max_groupsize, dtype='int64')
 
         len_index = len(self._selected_obj.index)
-        cumcounts = np.empty(len_index, dtype=arr.dtype)
+        cumcounts = np.zeros(len_index, dtype=arr.dtype)
+        if not len_index:
+            return cumcounts
+
+        indices, values = [], []
+        for v in self.indices.values():
+            indices.append(v)
+
+            if ascending:
+                values.append(arr[:len(v)])
+            else:
+                values.append(arr[len(v)-1::-1])
+
+        indices = np.concatenate(indices)
+        values = np.concatenate(values)
+        cumcounts[indices] = values
 
-        if ascending:
-            for v in self.indices.values():
-                cumcounts[v] = arr[:len(v)]
-        else:
-            for v in self.indices.values():
-                cumcounts[v] = arr[len(v)-1::-1]
         return cumcounts
 
     def _index_with_as_index(self, b):
@@ -1270,6 +1297,7 @@ def group_info(self):
         comp_ids = com._ensure_int64(comp_ids)
         return comp_ids, obs_group_ids, ngroups
 
+
     def _get_compressed_labels(self):
         all_labels = [ping.labels for ping in self.groupings]
         if self._overflow_possible:
@@ -1892,7 +1920,6 @@ def groups(self):
             self._groups = self.index.groupby(self.grouper)
         return self._groups
 
-
 def _get_grouper(obj, key=None, axis=0, level=None, sort=True):
     """
     create and return a BaseGrouper, which is an internal
@@ -2141,7 +2168,10 @@ def _wrap_aggregated_output(self, output, names=None):
         if names is not None:
             return DataFrame(output, index=index, columns=names)
         else:
-            return Series(output, index=index, name=self.name)
+            name = self.name
+            if name is None:
+                name = self._selected_obj.name
+            return Series(output, index=index, name=name)
 
     def _wrap_applied_output(self, keys, values, not_indexed_same=False):
         if len(keys) == 0:
diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py
@@ -282,6 +282,28 @@ def test_nth(self):
         expected = df.loc[[]]
         assert_frame_equal(result,expected)
 
+        # GH 7559
+        # from the vbench
+        df = DataFrame(np.random.randint(1, 10, (100, 2)))
+        s = df[1]
+        g = df[0]
+        expected = s.groupby(g).first()
+        expected2 = s.groupby(g).apply(lambda x: x.iloc[0])
+        assert_series_equal(expected2,expected)
+
+        # validate first
+        v = s[g==1].iloc[0]
+        self.assertEqual(expected.iloc[0],v)
+        self.assertEqual(expected2.iloc[0],v)
+
+        # this is NOT the same as .first (as sorted is default!)
+        # as it keeps the order in the series (and not the group order)
+        # related GH 7287
+        expected = s.groupby(g,sort=False).first()
+        expected.index = range(1,10)
+        result = s.groupby(g).nth(0,dropna='all')
+        assert_series_equal(result,expected)
+
     def test_grouper_index_types(self):
         # related GH5375
         # groupby misbehaving when using a Floatlike index
diff --git a/vb_suite/groupby.py b/vb_suite/groupby.py
@@ -244,11 +244,14 @@ def f():
 groupby_last_float32 = Benchmark('data2.groupby(labels).last()', setup,
                                  start_date=datetime(2013, 1, 1))
 
-groupby_nth_float64 = Benchmark('data.groupby(labels).nth(0)', setup,
-                         start_date=datetime(2012, 5, 1))
-
-groupby_nth_float32 = Benchmark('data2.groupby(labels).nth(0)', setup,
-                                 start_date=datetime(2013, 1, 1))
+groupby_nth_float64_none = Benchmark('data.groupby(labels).nth(0)', setup,
+                                     start_date=datetime(2012, 5, 1))
+groupby_nth_float32_none = Benchmark('data2.groupby(labels).nth(0)', setup,
+                                     start_date=datetime(2013, 1, 1))
+groupby_nth_float64_any = Benchmark('data.groupby(labels).nth(0,dropna="all")', setup,
+                                     start_date=datetime(2012, 5, 1))
+groupby_nth_float32_any = Benchmark('data2.groupby(labels).nth(0,dropna="all")', setup,
+                                    start_date=datetime(2013, 1, 1))
 
 # with datetimes (GH7555)
 setup = common_setup + """
@@ -259,8 +262,10 @@ def f():
                                  start_date=datetime(2013, 5, 1))
 groupby_last_datetimes = Benchmark('df.groupby("b").last()', setup,
                                  start_date=datetime(2013, 5, 1))
-groupby_nth_datetimes = Benchmark('df.groupby("b").nth(0)', setup,
-                                 start_date=datetime(2013, 5, 1))
+groupby_nth_datetimes_none = Benchmark('df.groupby("b").nth(0)', setup,
+                                       start_date=datetime(2013, 5, 1))
+groupby_nth_datetimes_any = Benchmark('df.groupby("b").nth(0,dropna="all")', setup,
+                                      start_date=datetime(2013, 5, 1))
 
 # with object
 setup = common_setup + """
@@ -271,8 +276,10 @@ def f():
                                  start_date=datetime(2013, 5, 1))
 groupby_last_object = Benchmark('df.groupby("b").last()', setup,
                                  start_date=datetime(2013, 5, 1))
-groupby_nth_object = Benchmark('df.groupby("b").nth(0)', setup,
-                                 start_date=datetime(2013, 5, 1))
+groupby_nth_object_none = Benchmark('df.groupby("b").nth(0)', setup,
+                                    start_date=datetime(2013, 5, 1))
+groupby_nth_object_any = Benchmark('df.groupby("b").nth(0,dropna="any")', setup,
+                                   start_date=datetime(2013, 5, 1))
 
 #----------------------------------------------------------------------
 # groupby_indices replacement, chop up Series
@@ -351,11 +358,16 @@ def f(g):
 """
 
 # Not really a fair test as behaviour has changed!
-groupby_frame_nth = Benchmark("df.groupby(0).nth(0)", setup,
-                              start_date=datetime(2014, 3, 1))
+groupby_frame_nth_none = Benchmark("df.groupby(0).nth(0)", setup,
+                                   start_date=datetime(2014, 3, 1))
+
+groupby_series_nth_none = Benchmark("df[1].groupby(df[0]).nth(0)", setup,
+                                    start_date=datetime(2014, 3, 1))
+groupby_frame_nth_any= Benchmark("df.groupby(0).nth(0,dropna='any')", setup,
+                                 start_date=datetime(2014, 3, 1))
 
-groupby_series_nth = Benchmark("df[1].groupby(df[0]).nth(0)", setup,
-                               start_date=datetime(2014, 3, 1))
+groupby_series_nth_any = Benchmark("df[1].groupby(df[0]).nth(0,dropna='any')", setup,
+                                   start_date=datetime(2014, 3, 1))
 
 
 #----------------------------------------------------------------------