PERF: improves performance in GroupBy.cumcount

behzadnouri · jreback · commit 445d1c6d0063 · 2016-04-25T10:19:41.000-04:00
closes #12839 closes #11039
diff --git a/doc/source/whatsnew/v0.18.1.txt b/doc/source/whatsnew/v0.18.1.txt
@@ -131,6 +131,79 @@ These changes conform sparse handling to return the correct types and work to ma
 - Bug in ``pd.concat()`` of ``SparseDataFrame`` may raise ``AttributeError`` (:issue:`12174`)
 - Bug in ``SparseArray.shift()`` may raise ``NameError`` or ``TypeError`` (:issue:`12908`)
 
+.. _whatsnew_0181.api.groubynth:
+
+``.groupby(..).nth()`` changes
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The index in ``.groupby(..).nth()`` output is now more consistent when the ``as_index`` argument is passed (:issue:`11039`):
+
+.. ipython:: python
+
+   df = DataFrame({'A' : ['a', 'b', 'a'],
+                   'B' : [1, 2, 3]})
+   df
+
+Previous Behavior:
+
+.. code-block:: ipython
+
+   In [3]: df.groupby('A', as_index=True)['B'].nth(0)
+   Out[3]:
+   0    1
+   1    2
+   Name: B, dtype: int64
+
+   In [4]: df.groupby('A', as_index=False)['B'].nth(0)
+   Out[4]:
+   0    1
+   1    2
+   Name: B, dtype: int64
+
+New Behavior:
+
+.. ipython:: python
+
+    df.groupby('A', as_index=True)['B'].nth(0)
+    df.groupby('A', as_index=False)['B'].nth(0)
+
+Furthermore, previously, ``.groupby(..).nth()`` would always sort its output, even if ``sort=False`` was passed to ``.groupby``.
+
+.. ipython:: python
+
+   np.random.seed(1234)
+   df = pd.DataFrame(np.random.randn(100, 2), columns=['a', 'b'])
+   df['c'] = np.random.randint(0, 4, 100)
+
+Previous Behavior:
+
+.. code-block:: ipython
+
+   In [4]: df.groupby('c', sort=True).nth(1)
+   Out[4]:
+             a         b
+   c
+   0 -0.334077  0.002118
+   1  0.036142 -2.074978
+   2 -0.720589  0.887163
+   3  0.859588 -0.636524
+
+   In [5]: df.groupby('c', sort=False).nth(1)
+   Out[5]:
+             a         b
+   c
+   0 -0.334077  0.002118
+   1  0.036142 -2.074978
+   2 -0.720589  0.887163
+   3  0.859588 -0.636524
+
+New Behavior:
+
+.. ipython:: python
+
+   df.groupby('c', sort=True).nth(1)
+   df.groupby('c', sort=False).nth(1)
+
 .. _whatsnew_0181.api:
 
 API changes
@@ -255,6 +328,8 @@ Performance Improvements
 ~~~~~~~~~~~~~~~~~~~~~~~~
 
 - Improved speed of SAS reader (:issue:`12656`)
+- Performance improvements in ``.groupby(..).cumcount()`` (:issue:`11039`)
+
 
 - Improved performance of ``DataFrame.to_sql`` when checking case sensitivity for tables. Now only checks if table has been created correctly when table name is not lower case. (:issue:`12876`)
 
diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py
@@ -653,37 +653,37 @@ def _iterate_slices(self):
     def transform(self, func, *args, **kwargs):
         raise AbstractMethodError(self)
 
-    def _cumcount_array(self, arr=None, ascending=True):
+    def _cumcount_array(self, ascending=True):
         """
-        arr is where cumcount gets its values from
+        Parameters
+        ----------
+        ascending : bool, default True
+            If False, number in reverse, from length of group - 1 to 0.
 
         Note
         ----
         this is currently implementing sort=False
         (though the default is sort=True) for groupby in general
         """
-        if arr is None:
-            arr = np.arange(self.grouper._max_groupsize, dtype='int64')
-
-        len_index = len(self._selected_obj.index)
-        cumcounts = np.zeros(len_index, dtype=arr.dtype)
-        if not len_index:
-            return cumcounts
+        ids, _, ngroups = self.grouper.group_info
+        sorter = _get_group_index_sorter(ids, ngroups)
+        ids, count = ids[sorter], len(ids)
 
-        indices, values = [], []
-        for v in self.indices.values():
-            indices.append(v)
+        if count == 0:
+            return np.empty(0, dtype=np.int64)
 
-            if ascending:
-                values.append(arr[:len(v)])
-            else:
-                values.append(arr[len(v) - 1::-1])
+        run = np.r_[True, ids[:-1] != ids[1:]]
+        rep = np.diff(np.r_[np.nonzero(run)[0], count])
+        out = (~run).cumsum()
 
-        indices = np.concatenate(indices)
-        values = np.concatenate(values)
-        cumcounts[indices] = values
+        if ascending:
+            out -= np.repeat(out[run], rep)
+        else:
+            out = np.repeat(out[np.r_[run[1:], True]], rep) - out
 
-        return cumcounts
+        rev = np.empty(count, dtype=np.intp)
+        rev[sorter] = np.arange(count, dtype=np.intp)
+        return out[rev].astype(np.int64, copy=False)
 
     def _index_with_as_index(self, b):
         """
@@ -1170,47 +1170,21 @@ def nth(self, n, dropna=None):
         else:
             raise TypeError("n needs to be an int or a list/set/tuple of ints")
 
-        m = self.grouper._max_groupsize
-        # filter out values that are outside [-m, m)
-        pos_nth_values = [i for i in nth_values if i >= 0 and i < m]
-        neg_nth_values = [i for i in nth_values if i < 0 and i >= -m]
-
+        nth_values = np.array(nth_values, dtype=np.intp)
         self._set_selection_from_grouper()
-        if not dropna:  # good choice
-            if not pos_nth_values and not neg_nth_values:
-                # no valid nth values
-                return self._selected_obj.loc[[]]
-
-            rng = np.zeros(m, dtype=bool)
-            for i in pos_nth_values:
-                rng[i] = True
-            is_nth = self._cumcount_array(rng)
 
-            if neg_nth_values:
-                rng = np.zeros(m, dtype=bool)
-                for i in neg_nth_values:
-                    rng[- i - 1] = True
-                is_nth |= self._cumcount_array(rng, ascending=False)
+        if not dropna:
+            mask = np.in1d(self._cumcount_array(), nth_values) | \
+                np.in1d(self._cumcount_array(ascending=False) + 1, -nth_values)
 
-            result = self._selected_obj[is_nth]
+            out = self._selected_obj[mask]
+            if not self.as_index:
+                return out
 
-            # the result index
-            if self.as_index:
-                ax = self.obj._info_axis
-                names = self.grouper.names
-                if self.obj.ndim == 1:
-                    # this is a pass-thru
-                    pass
-                elif all([x in ax for x in names]):
-                    indicies = [self.obj[name][is_nth] for name in names]
-                    result.index = MultiIndex.from_arrays(
-                        indicies).set_names(names)
-                elif self._group_selection is not None:
-                    result.index = self.obj._get_axis(self.axis)[is_nth]
-
-                result = result.sort_index()
+            ids, _, _ = self.grouper.group_info
+            out.index = self.grouper.result_index[ids[mask]]
 
-            return result
+            return out.sort_index() if self.sort else out
 
         if isinstance(self._selected_obj, DataFrame) and \
            dropna not in ['any', 'all']:
@@ -1241,8 +1215,8 @@ def nth(self, n, dropna=None):
                                          axis=self.axis, level=self.level,
                                          sort=self.sort)
 
-        sizes = dropped.groupby(grouper).size()
-        result = dropped.groupby(grouper).nth(n)
+        grb = dropped.groupby(grouper, as_index=self.as_index, sort=self.sort)
+        sizes, result = grb.size(), grb.nth(n)
         mask = (sizes < max_len).values
 
         # set the results which don't meet the criteria
@@ -1380,11 +1354,8 @@ def head(self, n=5):
         0  1  2
         2  5  6
         """
-
-        obj = self._selected_obj
-        in_head = self._cumcount_array() < n
-        head = obj[in_head]
-        return head
+        mask = self._cumcount_array() < n
+        return self._selected_obj[mask]
 
     @Substitution(name='groupby')
     @Appender(_doc_template)
@@ -1409,12 +1380,8 @@ def tail(self, n=5):
         0  a  1
         2  b  1
         """
-
-        obj = self._selected_obj
-        rng = np.arange(0, -self.grouper._max_groupsize, -1, dtype='int64')
-        in_tail = self._cumcount_array(rng, ascending=False) > -n
-        tail = obj[in_tail]
-        return tail
+        mask = self._cumcount_array(ascending=False) < n
+        return self._selected_obj[mask]
 
 
 @Appender(GroupBy.__doc__)
diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py
@@ -167,8 +167,7 @@ def test_first_last_nth(self):
         self.df.loc[self.df['A'] == 'foo', 'B'] = np.nan
         self.assertTrue(com.isnull(grouped['B'].first()['foo']))
         self.assertTrue(com.isnull(grouped['B'].last()['foo']))
-        self.assertTrue(com.isnull(grouped['B'].nth(0)[0])
-                        )  # not sure what this is testing
+        self.assertTrue(com.isnull(grouped['B'].nth(0)['foo']))
 
         # v0.14.0 whatsnew
         df = DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=['A', 'B'])
@@ -221,12 +220,12 @@ def test_nth(self):
 
         assert_frame_equal(g.nth(0), df.iloc[[0, 2]].set_index('A'))
         assert_frame_equal(g.nth(1), df.iloc[[1]].set_index('A'))
-        assert_frame_equal(g.nth(2), df.loc[[], ['B']])
+        assert_frame_equal(g.nth(2), df.loc[[]].set_index('A'))
         assert_frame_equal(g.nth(-1), df.iloc[[1, 2]].set_index('A'))
         assert_frame_equal(g.nth(-2), df.iloc[[0]].set_index('A'))
-        assert_frame_equal(g.nth(-3), df.loc[[], ['B']])
-        assert_series_equal(g.B.nth(0), df.B.iloc[[0, 2]])
-        assert_series_equal(g.B.nth(1), df.B.iloc[[1]])
+        assert_frame_equal(g.nth(-3), df.loc[[]].set_index('A'))
+        assert_series_equal(g.B.nth(0), df.set_index('A').B.iloc[[0, 2]])
+        assert_series_equal(g.B.nth(1), df.set_index('A').B.iloc[[1]])
         assert_frame_equal(g[['B']].nth(0),
                            df.ix[[0, 2], ['A', 'B']].set_index('A'))
 
@@ -262,11 +261,11 @@ def test_nth(self):
                                 4: 0.70422799999999997}}).set_index(['color',
                                                                      'food'])
 
-        result = df.groupby(level=0).nth(2)
+        result = df.groupby(level=0, as_index=False).nth(2)
         expected = df.iloc[[-1]]
         assert_frame_equal(result, expected)
 
-        result = df.groupby(level=0).nth(3)
+        result = df.groupby(level=0, as_index=False).nth(3)
         expected = df.loc[[]]
         assert_frame_equal(result, expected)
 
@@ -290,8 +289,7 @@ def test_nth(self):
         # as it keeps the order in the series (and not the group order)
         # related GH 7287
         expected = s.groupby(g, sort=False).first()
-        expected.index = pd.Index(range(1, 10), name=0)
-        result = s.groupby(g).nth(0, dropna='all')
+        result = s.groupby(g, sort=False).nth(0, dropna='all')
         assert_series_equal(result, expected)
 
         # doc example
@@ -316,14 +314,14 @@ def test_nth(self):
         assert_frame_equal(
             g.nth([0, 1, -1]), df.iloc[[0, 1, 2, 3, 4]].set_index('A'))
         assert_frame_equal(g.nth([2]), df.iloc[[2]].set_index('A'))
-        assert_frame_equal(g.nth([3, 4]), df.loc[[], ['B']])
+        assert_frame_equal(g.nth([3, 4]), df.loc[[]].set_index('A'))
 
         business_dates = pd.date_range(start='4/1/2014', end='6/30/2014',
                                        freq='B')
         df = DataFrame(1, index=business_dates, columns=['a', 'b'])
         # get the first, fourth and last two business days for each month
-        result = df.groupby((df.index.year, df.index.month)).nth([0, 3, -2, -1
-                                                                  ])
+        key = (df.index.year, df.index.month)
+        result = df.groupby(key, as_index=False).nth([0, 3, -2, -1])
         expected_dates = pd.to_datetime(
             ['2014/4/1', '2014/4/4', '2014/4/29', '2014/4/30', '2014/5/1',
              '2014/5/6', '2014/5/29', '2014/5/30', '2014/6/2', '2014/6/5',