ENH: add .ngroup() method to groupby objects (#14026)

dsm054 · dsm054 · commit 7aee0716525b · 2017-03-21T22:13:29.000-04:00
Closes #11642
diff --git a/doc/source/api.rst b/doc/source/api.rst
@@ -1706,6 +1706,7 @@ Computations / Descriptive Stats
    GroupBy.mean
    GroupBy.median
    GroupBy.min
+   GroupBy.ngroup
    GroupBy.nth
    GroupBy.ohlc
    GroupBy.prod
diff --git a/doc/source/groupby.rst b/doc/source/groupby.rst
@@ -1087,6 +1087,23 @@ To see the order in which each row appears within its group, use the
 
    df.groupby('A').cumcount(ascending=False)  # kwarg only
 
+Enumerate groups
+~~~~~~~~~~~~~~~~
+
+.. versionadded:: 0.20.0
+
+To see the ordering of the groups themselves, you can use the ``ngroup``
+method:
+
+.. ipython:: python
+
+   df = pd.DataFrame(list('aaabba'), columns=['A'])
+   df
+
+   df.groupby('A').ngroup()
+
+   df.groupby('A').ngroup(ascending=False)  # kwarg only
+
 Plotting
 ~~~~~~~~
 
@@ -1178,3 +1195,20 @@ column index name will be used as the name of the inserted column:
    result
 
    result.stack()
+
+Multi-column factorization
+~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+By using ``.ngroup()``, we can extract information about the groups in a
+way similar to ``pd.factorize()``, but which applies naturally to multiple
+columns of mixed type and different sources:
+
+.. ipython:: python
+
+    df = pd.DataFrame({"A": [1, 1, 2, 3, 2], "B": list("aaaba")})
+
+    df
+
+    df.groupby(["A", "B"]).ngroup()
+
+    df.groupby(["A", [0, 0, 0, 1, 1]]).ngroup()
diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt
@@ -56,8 +56,8 @@ fixed-width text files, and :func:`read_excel` for parsing Excel files.
 
 .. _whatsnew_0200.enhancements.groupby_access:
 
-Groupby Enhancements
-^^^^^^^^^^^^^^^^^^^^
+Groupby Access Enhancements
+^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
 Strings passed to ``DataFrame.groupby()`` as the ``by`` parameter may now reference either column names or index level names (:issue:`5677`)
 
@@ -75,6 +75,21 @@ Strings passed to ``DataFrame.groupby()`` as the ``by`` parameter may now refere
 
    df.groupby(['second', 'A']).sum()
 
+.. _whatsnew_0200.enhancements.groupby_ngroup:
+
+Groupby Group Numbers
+^^^^^^^^^^^^^^^^^^^^^
+
+A new groupby method ``ngroup``, parallel to the existing ``cumcount``, has been added to return the group order (:issue:`11642`).
+
+.. ipython:: python
+
+   df = pd.DataFrame({"A": [1, 1, 2, 3, 3], "B": list("aaaba")})
+
+   df.groupby("A").ngroup()
+
+   df.groupby(["A", "B"]).ngroup()
+
 .. _whatsnew_0200.enhancements.compressed_urls:
 
 Better support for compressed URLs in ``read_csv``
diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py
@@ -1363,6 +1363,62 @@ def nth(self, n, dropna=None):
 
         return result
 
+    @Substitution(name='groupby')
+    @Appender(_doc_template)
+    def ngroup(self, ascending=True):
+        """
+        Number each group from 0 to the number of groups - 1.
+
+        This is the enumerative complement of cumcount.  Note that the
+        numbers given to the groups match the order in which the groups
+        would be seen when iterating over the groupby object, not the
+        order they are first observed.
+
+        .. versionadded:: 0.20.0
+
+        Parameters
+        ----------
+        ascending : bool, default True
+            If False, number in reverse, from number of groups - 1 to 0.
+
+        Examples
+        --------
+
+        >>> df = pd.DataFrame({"A": list("aaabba")})
+        >>> df
+           A
+        0  a
+        1  a
+        2  a
+        3  b
+        4  b
+        5  a
+        >>> df.groupby('A').ngroup()
+        0    0
+        1    0
+        2    0
+        3    1
+        4    1
+        5    0
+        dtype: int64
+        >>> df.groupby('A').ngroup(ascending=False)
+        0    1
+        1    1
+        2    1
+        3    0
+        4    0
+        5    1
+        dtype: int64
+        """
+
+        self._set_group_selection()
+
+        index = self._selected_obj.index
+        result = Series(self.grouper.group_info[0], index)  # per-row group id
+        if not ascending:
+            result = self.ngroups - 1 - result  # flip: 0 <-> ngroups - 1
+        return result
+
     @Substitution(name='groupby')
     @Appender(_doc_template)
     def cumcount(self, ascending=True):
diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py