ENH: Add groupby().enumerate method to count groups (#11642)

dsm054 · dsm054 · commit a6e60a7adfe0 · 2016-08-17T21:41:23.000-04:00
diff --git a/doc/source/api.rst b/doc/source/api.rst
@@ -1682,6 +1682,7 @@ Computations / Descriptive Stats
 
    GroupBy.count
    GroupBy.cumcount
+   GroupBy.enumerate
    GroupBy.first
    GroupBy.head
    GroupBy.last
diff --git a/doc/source/groupby.rst b/doc/source/groupby.rst
@@ -969,7 +969,7 @@ Enumerate group items
 .. versionadded:: 0.13.0
 
 To see the order in which each row appears within its group, use the
-``cumcount`` method:
+``cumcount`` method (compare with ``enumerate``):
 
 .. ipython:: python
 
@@ -980,6 +980,23 @@ To see the order in which each row appears within its group, use the
 
    df.groupby('A').cumcount(ascending=False)  # kwarg only
 
+Enumerate groups
+~~~~~~~~~~~~~~~~
+
+.. versionadded:: 0.19.0
+
+To see the ordering of the groups themselves (as opposed to the order of rows
+within each group, which ``cumcount`` gives), use the ``enumerate`` method:
+
+.. ipython:: python
+
+   df = pd.DataFrame(list('aaabba'), columns=['A'])
+   df
+
+   df.groupby('A').enumerate()
+
+   df.groupby('A').enumerate(ascending=False)  # number the groups in reverse
+
 Plotting
 ~~~~~~~~
 
diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt
@@ -389,6 +389,7 @@ Other enhancements
 
 - ``Categorical.astype()`` now accepts an optional boolean argument ``copy``, effective when dtype is categorical (:issue:`13209`)
 - ``DataFrame`` has gained the ``.asof()`` method to return the last non-NaN values according to the selected subset (:issue:`13358`)
+- A new groupby method ``enumerate``, parallel to the existing ``cumcount``, has been added to return the group order (:issue:`11642`)
 - Consistent with the Python API, ``pd.read_csv()`` will now interpret ``+inf`` as positive infinity (:issue:`13274`)
 - The ``DataFrame`` constructor will now respect key ordering if a list of ``OrderedDict`` objects are passed in (:issue:`13304`)
 - ``pd.read_html()`` has gained support for the ``decimal`` option (:issue:`12907`)
diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py
@@ -1329,6 +1329,74 @@ def nth(self, n, dropna=None):
 
         return result
 
+    @Substitution(name='groupby')
+    @Appender(_doc_template)
+    def enumerate(self, ascending=True):
+        """
+        Number each group from 0 to the number of groups - 1.
+
+        This is the enumerative complement of cumcount.  Note that the
+        numbers given to the groups match the order in which the groups
+        would be seen when iterating over the groupby object, not the
+        order they are first observed.
+
+        Parameters
+        ----------
+        ascending : bool, default True
+            If False, number in reverse, from number of groups - 1 to 0.
+
+        Examples
+        --------
+
+        >>> df = pd.DataFrame([['a'], ['a'], ['a'], ['b'], ['b'], ['a']],
+        ...                   columns=['A'])
+        >>> df
+           A
+        0  a
+        1  a
+        2  a
+        3  b
+        4  b
+        5  a
+        >>> df.groupby('A').enumerate()
+        0    0
+        1    0
+        2    0
+        3    1
+        4    1
+        5    0
+        dtype: int64
+        >>> df.groupby('A').enumerate(ascending=False)
+        0    1
+        1    1
+        2    1
+        3    0
+        4    0
+        5    1
+        dtype: int64
+        >>> df = pd.DataFrame([['b'], ['a'], ['a'], ['b']], columns=['A'])
+        >>> df
+           A
+        0  b
+        1  a
+        2  a
+        3  b
+        >>> df.groupby("A").enumerate()
+        0    1
+        1    0
+        2    0
+        3    1
+        dtype: int64
+        """
+
+        self._set_group_selection()
+
+        index = self._selected_obj.index
+        result = Series(self.grouper.group_info[0], index)
+        if not ascending:
+            result = self.ngroups - 1 - result
+        return result
+
     @Substitution(name='groupby')
     @Appender(_doc_template)
     def cumcount(self, ascending=True):
diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py
@@ -5141,6 +5141,118 @@ def test_cumcount_groupby_not_col(self):
         assert_series_equal(expected, g.cumcount())
         assert_series_equal(expected, sg.cumcount())
 
+    def test_enumerate(self):
+        df = DataFrame([['a'], ['a'], ['a'], ['b'], ['a']], columns=['A'])
+        g = df.groupby('A')
+        sg = g.A
+
+        expected = Series([0, 0, 0, 1, 0])
+
+        assert_series_equal(expected, g.enumerate())
+        assert_series_equal(expected, sg.enumerate())
+
+    def test_enumerate_empty(self):
+        ge = DataFrame().groupby(level=0)
+        se = Series().groupby(level=0)
+
+        # edge case: an empty Series is usually float, so force int64
+        e = Series(dtype='int64')
+
+        assert_series_equal(e, ge.enumerate())
+        assert_series_equal(e, se.enumerate())
+
+    def test_enumerate_dupe_index(self):
+        df = DataFrame([['a'], ['a'], ['a'], ['b'], ['a']], columns=['A'],
+                       index=[0] * 5)
+        g = df.groupby('A')
+        sg = g.A
+
+        expected = Series([0, 0, 0, 1, 0], index=[0] * 5)
+
+        assert_series_equal(expected, g.enumerate())
+        assert_series_equal(expected, sg.enumerate())
+
+    def test_enumerate_mi(self):
+        mi = MultiIndex.from_tuples([[0, 1], [1, 2], [2, 2], [2, 2], [1, 0]])
+        df = DataFrame([['a'], ['a'], ['a'], ['b'], ['a']], columns=['A'],
+                       index=mi)
+        g = df.groupby('A')
+        sg = g.A
+
+        expected = Series([0, 0, 0, 1, 0], index=mi)
+
+        assert_series_equal(expected, g.enumerate())
+        assert_series_equal(expected, sg.enumerate())
+
+    def test_enumerate_groupby_not_col(self):
+        df = DataFrame([['a'], ['a'], ['a'], ['b'], ['a']], columns=['A'],
+                       index=[0] * 5)
+        g = df.groupby([0, 0, 0, 1, 0])
+        sg = g.A
+
+        expected = Series([0, 0, 0, 1, 0], index=[0] * 5)
+
+        assert_series_equal(expected, g.enumerate())
+        assert_series_equal(expected, sg.enumerate())
+
+    def test_enumerate_descending(self):
+        df = DataFrame(['a', 'a', 'b', 'a', 'b'], columns=['A'])
+        g = df.groupby(['A'])
+
+        ascending = Series([0, 0, 1, 0, 1])
+        descending = Series([1, 1, 0, 1, 0])
+
+        assert_series_equal(descending, (g.ngroups - 1) - ascending)
+        assert_series_equal(ascending, g.enumerate(ascending=True))
+        assert_series_equal(descending, g.enumerate(ascending=False))
+
+    def test_enumerate_matches_cumcount(self):
+        # specific case: the (A, X) key pairs sort as ax, ay, bx, by
+        df = DataFrame([['a', 'x'], ['a', 'y'], ['b', 'x'],
+                        ['a', 'x'], ['b', 'y']], columns=['A', 'X'])
+        g = df.groupby(['A', 'X'])
+
+        g_enumerate = g.enumerate()
+        g_cumcount = g.cumcount()
+        expected_enumerate = pd.Series([0, 1, 2, 0, 3])
+        expected_cumcount = pd.Series([0, 0, 0, 1, 0])
+
+        assert_series_equal(g_enumerate, expected_enumerate)
+        assert_series_equal(g_cumcount, expected_cumcount)
+
+    def test_enumerate_cumcount_pair(self):
+        from itertools import product
+
+        # brute force comparison, inefficient but clear
+        for p in product(range(3), repeat=4):
+            df = DataFrame({'a': p})
+            g = df.groupby(['a'])
+
+            order = sorted(set(p))
+            enumerated = [order.index(val) for val in p]
+            cumcounted = [p[:i].count(val) for i, val in enumerate(p)]
+
+            assert_series_equal(g.enumerate(), pd.Series(enumerated))
+            assert_series_equal(g.cumcount(), pd.Series(cumcounted))
+
+    def test_enumerate_respects_groupby_order(self):
+        np.random.seed(0)
+        df = DataFrame({'a': np.random.choice(list('abcdef'), 100)})
+        for sort_flag in (False, True):
+            g = df.groupby(['a'], sort=sort_flag)
+            df['group_id'] = -1
+            df['group_index'] = -1
+
+            for i, (key, group) in enumerate(g):
+                df.loc[group.index, 'group_id'] = i
+                for j, ind in enumerate(group.index):
+                    df.loc[ind, 'group_index'] = j
+
+            assert_series_equal(pd.Series(df['group_id'].values),
+                                g.enumerate())
+            assert_series_equal(pd.Series(df['group_index'].values),
+                                g.cumcount())
+
     def test_filter_series(self):
         s = pd.Series([1, 3, 20, 5, 22, 24, 7])
         expected_odd = pd.Series([1, 3, 5, 7], index=[0, 1, 3, 6])