ENH: Implement DataFrame.value_counts (#31247)

dsaxton · web-flow · commit 8b200c106c6a · 2020-02-25T21:20:43.000-05:00
diff --git a/doc/source/getting_started/basics.rst b/doc/source/getting_started/basics.rst
@@ -689,6 +689,17 @@ of a 1D array of values. It can also be used as a function on regular arrays:
    s.value_counts()
    pd.value_counts(data)
 
+.. versionadded:: 1.1.0
+
+The :meth:`~DataFrame.value_counts` method can be used to count combinations across multiple columns.
+By default, all columns are used, but a subset can be selected using the ``subset`` argument.
+
+.. ipython:: python
+
+    data = {"a": [1, 2, 3, 4], "b": ["x", "x", "y", "y"]}
+    frame = pd.DataFrame(data)
+    frame.value_counts()
+
 Similarly, you can get the most frequently occurring value(s) (the mode) of the values in a Series or DataFrame:
 
 .. ipython:: python
diff --git a/doc/source/reference/frame.rst b/doc/source/reference/frame.rst
@@ -170,6 +170,7 @@ Computations / descriptive stats
    DataFrame.std
    DataFrame.var
    DataFrame.nunique
+   DataFrame.value_counts
 
 Reindexing / selection / label manipulation
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst
@@ -55,6 +55,7 @@ Other API changes
 
 - :meth:`Series.describe` will now show distribution percentiles for ``datetime`` dtypes, statistics ``first`` and ``last``
   will now be ``min`` and ``max`` to match with numeric dtypes in :meth:`DataFrame.describe` (:issue:`30164`)
+- Added :meth:`DataFrame.value_counts` (:issue:`5377`)
 - :meth:`Groupby.groups` now returns an abbreviated representation when called on large dataframes (:issue:`1135`)
 - ``loc`` lookups with an object-dtype :class:`Index` and an integer key will now raise ``KeyError`` instead of ``TypeError`` when key is missing (:issue:`31905`)
 -
diff --git a/pandas/core/base.py b/pandas/core/base.py
@@ -1196,6 +1196,7 @@ def value_counts(
         --------
         Series.count: Number of non-NA elements in a Series.
         DataFrame.count: Number of non-NA elements in a DataFrame.
+        DataFrame.value_counts: Equivalent method on DataFrames.
 
         Examples
         --------
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -111,7 +111,7 @@
 from pandas.core.indexes import base as ibase
 from pandas.core.indexes.api import Index, ensure_index, ensure_index_from_sequences
 from pandas.core.indexes.datetimes import DatetimeIndex
-from pandas.core.indexes.multi import maybe_droplevels
+from pandas.core.indexes.multi import MultiIndex, maybe_droplevels
 from pandas.core.indexes.period import PeriodIndex
 from pandas.core.indexing import check_bool_indexer, convert_to_index_sliceable
 from pandas.core.internals import BlockManager
@@ -4569,6 +4569,10 @@ def drop_duplicates(
         -------
         DataFrame
             DataFrame with duplicates removed or None if ``inplace=True``.
+
+        See Also
+        --------
+        DataFrame.value_counts: Count unique combinations of columns.
         """
         if self.empty:
             return self.copy()
@@ -4814,6 +4818,102 @@ def sort_index(
         else:
             return self._constructor(new_data).__finalize__(self)
 
+    def value_counts(
+        self,
+        subset: Optional[Sequence[Label]] = None,
+        normalize: bool = False,
+        sort: bool = True,
+        ascending: bool = False,
+    ):
+        """
+        Return a Series containing counts of unique rows in the DataFrame.
+
+        .. versionadded:: 1.1.0
+
+        Parameters
+        ----------
+        subset : list-like, optional
+            Columns to use when counting unique combinations. Defaults to all columns.
+        normalize : bool, default False
+            Return proportions rather than frequencies.
+        sort : bool, default True
+            Sort by frequencies.
+        ascending : bool, default False
+            Sort in ascending order. Only used when ``sort`` is True.
+
+        Returns
+        -------
+        Series
+
+        See Also
+        --------
+        Series.value_counts: Equivalent method on Series.
+
+        Notes
+        -----
+        The returned Series will have a MultiIndex with one level per input
+        column. Rows that contain any NA values are omitted from the result.
+        By default, the resulting Series will be in descending order so that
+        the first element is the most frequently occurring row.
+
+        Examples
+        --------
+        >>> df = pd.DataFrame({'num_legs': [2, 4, 4, 6],
+        ...                    'num_wings': [2, 0, 0, 0]},
+        ...                   index=['falcon', 'dog', 'cat', 'ant'])
+        >>> df
+                num_legs  num_wings
+        falcon         2          2
+        dog            4          0
+        cat            4          0
+        ant            6          0
+
+        >>> df.value_counts()
+        num_legs  num_wings
+        4         0            2
+        6         0            1
+        2         2            1
+        dtype: int64
+
+        >>> df.value_counts(sort=False)
+        num_legs  num_wings
+        2         2            1
+        4         0            2
+        6         0            1
+        dtype: int64
+
+        >>> df.value_counts(ascending=True)
+        num_legs  num_wings
+        2         2            1
+        6         0            1
+        4         0            2
+        dtype: int64
+
+        >>> df.value_counts(normalize=True)
+        num_legs  num_wings
+        4         0            0.50
+        6         0            0.25
+        2         2            0.25
+        dtype: float64
+        """
+        if subset is None:
+            subset = self.columns.tolist()
+
+        counts = self.groupby(subset).size()
+
+        if sort:
+            counts = counts.sort_values(ascending=ascending)
+        if normalize:
+            counts /= counts.sum()
+
+        # Force MultiIndex for single column
+        if len(subset) == 1:
+            counts.index = MultiIndex.from_arrays(
+                [counts.index], names=[counts.index.name]
+            )
+
+        return counts
+
     def nlargest(self, n, columns, keep="first") -> "DataFrame":
         """
         Return the first `n` rows ordered by `columns` in descending order.
diff --git a/pandas/tests/frame/methods/test_value_counts.py b/pandas/tests/frame/methods/test_value_counts.py
@@ -0,0 +1,102 @@
+import numpy as np
+
+import pandas as pd
+import pandas._testing as tm
+
+
+def test_data_frame_value_counts_unsorted():
+    df = pd.DataFrame(
+        {"num_legs": [2, 4, 4, 6], "num_wings": [2, 0, 0, 0]},
+        index=["falcon", "dog", "cat", "ant"],
+    )
+
+    result = df.value_counts(sort=False)
+    expected = pd.Series(
+        data=[1, 2, 1],
+        index=pd.MultiIndex.from_arrays(
+            [(2, 4, 6), (2, 0, 0)], names=["num_legs", "num_wings"]
+        ),
+    )
+
+    tm.assert_series_equal(result, expected)
+
+
+def test_data_frame_value_counts_ascending():
+    df = pd.DataFrame(
+        {"num_legs": [2, 4, 4, 6], "num_wings": [2, 0, 0, 0]},
+        index=["falcon", "dog", "cat", "ant"],
+    )
+
+    result = df.value_counts(ascending=True)
+    expected = pd.Series(
+        data=[1, 1, 2],
+        index=pd.MultiIndex.from_arrays(
+            [(2, 6, 4), (2, 0, 0)], names=["num_legs", "num_wings"]
+        ),
+    )
+
+    tm.assert_series_equal(result, expected)
+
+
+def test_data_frame_value_counts_default():
+    df = pd.DataFrame(
+        {"num_legs": [2, 4, 4, 6], "num_wings": [2, 0, 0, 0]},
+        index=["falcon", "dog", "cat", "ant"],
+    )
+
+    result = df.value_counts()
+    expected = pd.Series(
+        data=[2, 1, 1],
+        index=pd.MultiIndex.from_arrays(
+            [(4, 6, 2), (0, 0, 2)], names=["num_legs", "num_wings"]
+        ),
+    )
+
+    tm.assert_series_equal(result, expected)
+
+
+def test_data_frame_value_counts_normalize():
+    df = pd.DataFrame(
+        {"num_legs": [2, 4, 4, 6], "num_wings": [2, 0, 0, 0]},
+        index=["falcon", "dog", "cat", "ant"],
+    )
+
+    result = df.value_counts(normalize=True)
+    expected = pd.Series(
+        data=[0.5, 0.25, 0.25],
+        index=pd.MultiIndex.from_arrays(
+            [(4, 6, 2), (0, 0, 2)], names=["num_legs", "num_wings"]
+        ),
+    )
+
+    tm.assert_series_equal(result, expected)
+
+
+def test_data_frame_value_counts_single_col_default():
+    df = pd.DataFrame({"num_legs": [2, 4, 4, 6]})
+
+    result = df.value_counts()
+    expected = pd.Series(
+        data=[2, 1, 1],
+        index=pd.MultiIndex.from_arrays([[4, 6, 2]], names=["num_legs"]),
+    )
+
+    tm.assert_series_equal(result, expected)
+
+
+def test_data_frame_value_counts_empty():
+    df_no_cols = pd.DataFrame()
+
+    result = df_no_cols.value_counts()
+    expected = pd.Series([], dtype=np.int64)
+
+    tm.assert_series_equal(result, expected)
+
+
+def test_data_frame_value_counts_empty_normalize():
+    df_no_cols = pd.DataFrame()
+
+    result = df_no_cols.value_counts(normalize=True)
+    expected = pd.Series([], dtype=np.float64)
+
+    tm.assert_series_equal(result, expected)

Original file line number	Diff line number	Diff line change
`@@ -55,6 +55,7 @@ Other API changes`
`55`	`55`
`56`	`56`	- :meth:`Series.describe` will now show distribution percentiles for ``datetime`` dtypes, statistics ``first`` and ``last``
`57`	`57`	will now be ``min`` and ``max`` to match with numeric dtypes in :meth:`DataFrame.describe` (:issue:`30164`)
	`58`	+- Added :meth:`DataFrame.value_counts` (:issue:`5377`)
`58`	`59`	- :meth:`Groupby.groups` now returns an abbreviated representation when called on large dataframes (:issue:`1135`)
`59`	`60`	- ``loc`` lookups with an object-dtype :class:`Index` and an integer key will now raise ``KeyError`` instead of ``TypeError`` when key is missing (:issue:`31905`)
`60`	`61`	`-`