ENH: Add mode method to Series and DataFrame

jtratner · jtratner · commit 7841d9acfff7 · 2013-11-04T23:23:46.000-05:00
Abstract building up the tables with counts so value_count and mode can
share. Aww! DOC: Add documentation about mode()
diff --git a/doc/source/api.rst b/doc/source/api.rst
@@ -348,6 +348,7 @@ Computations / Descriptive Stats
    Series.mean
    Series.median
    Series.min
+   Series.mode
    Series.nunique
    Series.pct_change
    Series.prod
@@ -632,6 +633,7 @@ Computations / Descriptive Stats
    DataFrame.mean
    DataFrame.median
    DataFrame.min
+   DataFrame.mode
    DataFrame.pct_change
    DataFrame.prod
    DataFrame.quantile
diff --git a/doc/source/basics.rst b/doc/source/basics.rst
@@ -378,6 +378,7 @@ optional ``level`` parameter which applies only if the object has a
     ``median``, Arithmetic median of values
     ``min``, Minimum
     ``max``, Maximum
+    ``mode``, Mode
     ``abs``, Absolute Value
     ``prod``, Product of values
     ``std``, Unbiased standard deviation
@@ -473,8 +474,8 @@ value, ``idxmin`` and ``idxmax`` return the first matching index:
 
 .. _basics.discretization:
 
-Value counts (histogramming)
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Value counts (histogramming) / Mode
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
 The ``value_counts`` Series method and top-level function computes a histogram
 of a 1D array of values. It can also be used as a function on regular arrays:
@@ -487,6 +488,17 @@ of a 1D array of values. It can also be used as a function on regular arrays:
    s.value_counts()
    value_counts(data)
 
+Similarly, you can get the most frequently occurring value(s) (the mode) of the values in a Series or DataFrame:
+
+.. ipython:: python
+
+    data = [1, 1, 3, 3, 3, 5, 5, 7, 7, 7]
+    s = Series(data)
+    s.mode()
+    df = pd.DataFrame({"A": np.random.randint(0, 7, size=50),
+                       "B": np.random.randint(-10, 15, size=50)})
+    df.mode()
+
 
 Discretization and quantiling
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -514,6 +526,7 @@ normally distributed data into equal-size quartiles like so:
    value_counts(factor)
 
 We can also pass infinite values to define the bins:
+
 .. ipython:: python
 
    arr = np.random.randn(20)
diff --git a/doc/source/release.rst b/doc/source/release.rst
@@ -65,6 +65,8 @@ New features
     string via the ``date_format`` keyword (:issue:`4313`)
   - Added ``LastWeekOfMonth`` DateOffset (:issue:`4637`)
   - Added ``FY5253``, and ``FY5253Quarter`` DateOffsets (:issue:`4511`)
+  - Added ``mode()`` method to ``Series`` and ``DataFrame`` to get the
+    statistical mode(s) of a column/series. (:issue:`5367`)
 
 Experimental Features
 ~~~~~~~~~~~~~~~~~~~~~
diff --git a/doc/source/v0.13.0.txt b/doc/source/v0.13.0.txt
@@ -81,6 +81,8 @@ API changes
   ``SparsePanel``, etc.), now support the entire set of arithmetic operators
   and arithmetic flex methods (add, sub, mul, etc.). ``SparsePanel`` does not
   support ``pow`` or ``mod`` with non-scalars. (:issue:`3765`)
+- ``Series`` and ``DataFrame`` now have a ``mode()`` method to calculate the
+  statistical mode(s) by axis/Series. (:issue:`5367`)
 
 
 Prior Version Deprecations/Changes
diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
@@ -221,6 +221,34 @@ def value_counts(values, sort=True, ascending=False, normalize=False, bins=None)
     return result
 
 
+def mode(values):
+    """
+    Return the mode(s) of an array-like as a Series.
+
+    Parameters
+    ----------
+    values : Series or array-like
+        A Series is unwrapped to its underlying values; anything else is
+        converted with ``np.asanyarray``.
+
+    Returns
+    -------
+    modes : Series
+        Built with the input Series' ``_constructor`` (plain ``Series`` for
+        any other input) and the dtype of the input values.
+    """
+    from pandas.core.series import Series
+
+    if isinstance(values, Series):
+        constructor, values = values._constructor, values.values
+    else:
+        constructor, values = Series, np.asanyarray(values)
+
+    dtype = values.dtype
+
+    if com.is_integer_dtype(dtype):
+        # integer data is counted after conversion to int64
+        modes = htable.mode_int64(com._ensure_int64(values))
+    elif issubclass(dtype.type, (np.datetime64, np.timedelta64)):
+        # datetime-like data is counted through its int64 view
+        modes = htable.mode_int64(values.view(np.int64))
+    else:
+        # everything else is counted as objects, passing the null mask along
+        mask = com.isnull(values)
+        modes = htable.mode_object(com._ensure_object(values), mask)
+
+    # the original dtype is restored on the way out
+    return constructor(modes, dtype=dtype)
+
+
 def rank(values, axis=0, method='average', na_option='keep',
          ascending=True):
     """
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -4083,6 +4083,24 @@ def _get_agg_axis(self, axis_num):
         else:
             raise ValueError('Axis must be 0 or 1 (got %r)' % axis_num)
 
+    def mode(self, axis=0, numeric_only=False):
+        """
+        Gets the mode(s) of each column (or row) by applying ``Series.mode``
+        along the axis selected. Empty if nothing has 2+ occurrences. Adds a
+        row for each mode per label, fills in gaps with nan.
+
+        Parameters
+        ----------
+        axis : {0, 1, 'index', 'columns'} (default 0)
+            0/'index' : get mode of each column
+            1/'columns' : get mode of each row
+        numeric_only : bool, default False
+            if True, only apply to numeric columns
+
+        Returns
+        -------
+        modes : DataFrame
+        """
+        if numeric_only:
+            data = self._get_numeric_data()
+        else:
+            data = self
+        return data.apply(lambda s: s.mode(), axis=axis)
+
     def quantile(self, q=0.5, axis=0, numeric_only=True):
         """
         Return values at the given quantile over requested axis, a la
diff --git a/pandas/core/series.py b/pandas/core/series.py
@@ -1127,6 +1127,20 @@ def value_counts(self, normalize=False, sort=True, ascending=False, bins=None):
         return value_counts(self.values, sort=sort, ascending=ascending,
                             normalize=normalize, bins=bins)
 
+    def mode(self):
+        """
+        Return the mode(s) of the Series.
+
+        The result is always a Series, even when there is only one mode,
+        and it is empty when no value occurs at least 2 times.
+
+        Returns
+        -------
+        modes : Series
+        """
+        # TODO: Add option for bins like value_counts()
+        # imported here rather than at module level, as the original did
+        import pandas.core.algorithms as algos
+        return algos.mode(self)
+
     def unique(self):
         """
         Return array of unique values in the Series. Significantly faster than
diff --git a/pandas/hashtable.pyx b/pandas/hashtable.pyx
@@ -890,15 +890,12 @@ cdef class Int64Factorizer:
         return labels
 
 
-
-def value_count_int64(ndarray[int64_t] values):
+cdef build_count_table_int64(ndarray[int64_t] values, kh_int64_t *table):
     cdef:
+        int k
         Py_ssize_t i, n = len(values)
-        kh_int64_t *table
         int ret = 0
-        list uniques = []
 
-    table = kh_init_int64()
     kh_resize_int64(table, n)
 
     for i in range(n):
@@ -910,8 +907,17 @@ def value_count_int64(ndarray[int64_t] values):
             k = kh_put_int64(table, val, &ret)
             table.vals[k] = 1
 
-    # for (k = kh_begin(h); k != kh_end(h); ++k)
-    # 	if (kh_exist(h, k)) kh_value(h, k) = 1;
+
+cpdef value_count_int64(ndarray[int64_t] values):
+    cdef:
+        Py_ssize_t i
+        kh_int64_t *table
+        int ret = 0
+        int k
+
+    table = kh_init_int64()
+    build_count_table_int64(values, table)
+
     i = 0
     result_keys = np.empty(table.n_occupied, dtype=np.int64)
     result_counts = np.zeros(table.n_occupied, dtype=np.int64)
@@ -924,15 +930,15 @@ def value_count_int64(ndarray[int64_t] values):
 
     return result_keys, result_counts
 
-def value_count_object(ndarray[object] values,
-                       ndarray[uint8_t, cast=True] mask):
+
+cdef build_count_table_object(ndarray[object] values,
+                              ndarray[uint8_t, cast=True] mask,
+                              kh_pymap_t *table):
     cdef:
+        int k
         Py_ssize_t i, n = len(values)
-        kh_pymap_t *table
         int ret = 0
-        list uniques = []
 
-    table = kh_init_pymap()
     kh_resize_pymap(table, n // 10)
 
     for i in range(n):
@@ -947,6 +953,17 @@ def value_count_object(ndarray[object] values,
             k = kh_put_pymap(table, <PyObject*> val, &ret)
             table.vals[k] = 1
 
+
+cpdef value_count_object(ndarray[object] values,
+                       ndarray[uint8_t, cast=True] mask):
+    cdef:
+        Py_ssize_t i = len(values)
+        kh_pymap_t *table
+        int k
+
+    table = kh_init_pymap()
+    build_count_table_object(values, mask, table)
+
     i = 0
     result_keys = np.empty(table.n_occupied, dtype=object)
     result_counts = np.zeros(table.n_occupied, dtype=np.int64)
@@ -959,3 +976,64 @@ def value_count_object(ndarray[object] values,
 
     return result_keys, result_counts
 
+
+def mode_object(ndarray[object] values, ndarray[uint8_t, cast=True] mask):
+    """
+    Return the most frequent value(s) of an object array.
+
+    Parameters
+    ----------
+    values : ndarray[object]
+    mask : ndarray[uint8_t]
+        handed to ``build_count_table_object`` together with ``values``
+
+    Returns
+    -------
+    modes : ndarray[object]
+        Every value tied for the highest count, in hash table bucket order.
+        Empty when no value was counted at least twice.
+    """
+    cdef:
+        # a count below 2 never qualifies, so the bar starts there
+        int count, max_count = 2
+        int j = -1 # so you can do +=
+        int k
+        kh_pymap_t *table
+
+    table = kh_init_pymap()
+    try:
+        build_count_table_object(values, mask, table)
+
+        # worst case: every bucket holds a mode
+        modes = np.empty(table.n_buckets, dtype=np.object_)
+        for k in range(table.n_buckets):
+            if kh_exist_pymap(table, k):
+                count = table.vals[k]
+
+                if count == max_count:
+                    # tie with the current best: append
+                    j += 1
+                elif count > max_count:
+                    # new best: restart the result at position 0
+                    max_count = count
+                    j = 0
+                else:
+                    continue
+                modes[j] = <object> table.keys[k]
+    finally:
+        # free the table even if counting or allocation raised
+        kh_destroy_pymap(table)
+
+    return modes[:j+1]
+
+
+def mode_int64(ndarray[int64_t] values):
+    """
+    Return the most frequent value(s) of an int64 array.
+
+    Parameters
+    ----------
+    values : ndarray[int64_t]
+
+    Returns
+    -------
+    modes : ndarray[int64]
+        Every value tied for the highest count, in hash table bucket order.
+        Empty when no value was counted at least twice.
+    """
+    cdef:
+        # a count below 2 never qualifies, so the bar starts there
+        int count, max_count = 2
+        int j = -1 # so you can do +=
+        int k
+        kh_int64_t *table
+
+    table = kh_init_int64()
+    try:
+        build_count_table_int64(values, table)
+
+        # worst case: every bucket holds a mode
+        modes = np.empty(table.n_buckets, dtype=np.int64)
+        for k in range(table.n_buckets):
+            if kh_exist_int64(table, k):
+                count = table.vals[k]
+
+                if count == max_count:
+                    # tie with the current best: append
+                    j += 1
+                elif count > max_count:
+                    # new best: restart the result at position 0
+                    max_count = count
+                    j = 0
+                else:
+                    continue
+                modes[j] = table.keys[k]
+    finally:
+        # free the table even if counting or allocation raised
+        kh_destroy_int64(table)
+
+    return modes[:j+1]
diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py
@@ -10050,6 +10050,49 @@ def wrapper(x):
             self.assert_(np.isnan(r0).all())
             self.assert_(np.isnan(r1).all())
 
+    def test_mode(self):
+        """Check DataFrame.mode() column by column, including empty results,
+        NaN padding, ordering and mixed dtypes."""
+        df = pd.DataFrame({"A": [12, 12, 11, 12, 19, 11],
+                           "B": [10, 10, 10, 5, 3, 4],
+                           "C": [8, 8, 8, 9, 9, 9],
+                           "D": range(6)})
+        # single column with a single mode
+        assert_frame_equal(df[["A"]].mode(),
+                           pd.DataFrame({"A": [12]}))
+        # D has no repeated value, so the result is empty but keeps int64
+        assert_frame_equal(df[["D"]].mode(),
+                           pd.DataFrame(pd.Series([], dtype="int64"),
+                                        columns=["D"]))
+        assert_frame_equal(df[["A", "B"]].mode(),
+                           pd.DataFrame({"A": [12], "B": [10]}))
+        # C has two modes, so the shorter columns are padded with nan
+        assert_frame_equal(df.mode(),
+                           pd.DataFrame({"A": [12, np.nan],
+                                         "B": [10, np.nan],
+                                         "C": [8, 9],
+                                         "D": [np.nan, np.nan]}))
+
+        # should preserve order
+        df["C"] = list(reversed(df["C"]))
+        assert_frame_equal(df[["A", "B", "C"]].mode(),
+                           pd.DataFrame({"A": [12, np.nan],
+                                         "B": [10, np.nan],
+                                         "C": [9, 8]}))
+        # should work with heterogeneous types
+        df = pd.DataFrame({"A": range(6),
+                           "B": pd.date_range('2011', periods=6),
+                           "C": list('abcdef')})
+        # no column has a repeated value: empty result, dtypes preserved
+        exp = pd.DataFrame({"A": pd.Series([], dtype=df["A"].dtype),
+                            "B": pd.Series([], dtype=df["B"].dtype),
+                            "C": pd.Series([], dtype=df["C"].dtype)})
+        assert_frame_equal(df.mode(), exp)
+
+        # and also when not empty
+        # introduce exactly one duplicated value per column
+        df.loc[1, "A"] = 0
+        df.loc[4, "B"] = df.loc[3, "B"]
+        df.loc[5, "C"] = 'e'
+        exp = pd.DataFrame({"A": pd.Series([0], dtype=df["A"].dtype),
+                            "B": pd.Series([df.loc[3, "B"]], dtype=df["B"].dtype),
+                            "C": pd.Series(['e'], dtype=df["C"].dtype)})
+
+        assert_frame_equal(df.mode(), exp)
+
     def test_sum_corner(self):
         axis0 = self.empty.sum(0)
         axis1 = self.empty.sum(1)
diff --git a/pandas/tests/test_series.py b/pandas/tests/test_series.py
@@ -19,10 +19,8 @@
 from pandas.core.index import MultiIndex
 from pandas.tseries.index import Timestamp, DatetimeIndex
 import pandas.core.config as cf
-import pandas.core.series as smod
 import pandas.lib as lib
 
-import pandas.core.common as com
 import pandas.core.datetools as datetools
 import pandas.core.nanops as nanops
 
@@ -1721,6 +1719,35 @@ def test_median(self):
         int_ts = Series(np.ones(10, dtype=int), index=lrange(10))
         self.assertAlmostEqual(np.median(int_ts), int_ts.median())
 
+    def test_mode(self):
+        """Check Series.mode() for ties, empty results, a single mode, NaN
+        upcasting, strings and datetimes."""
+        # 11 and 12 both occur twice, so both are returned
+        s = Series([12, 12, 11, 10, 19, 11])
+        exp = Series([11, 12])
+        assert_series_equal(s.mode(), exp)
+
+        # nothing repeats -> empty result. The dtype is pinned to int64
+        # because plain ``int`` is a 32-bit dtype on some platforms.
+        assert_series_equal(Series([1, 2, 3]).mode(),
+                            Series([], dtype='int64'))
+
+        # 6 is the unique mode regardless of element order
+        lst = [5] * 20 + [1] * 10 + [6] * 25
+        np.random.shuffle(lst)
+        s = Series(lst)
+        assert_series_equal(s.mode(), Series([6]))
+
+        s = Series([5] * 10)
+        assert_series_equal(s.mode(), Series([5]))
+
+        # assigning nan makes the Series float; 6 stays the mode even if
+        # the overwritten element was a 6 (24 sixes vs 20 fives)
+        s = Series(lst)
+        s[0] = np.nan
+        assert_series_equal(s.mode(), Series([6], dtype=float))
+
+        s = Series(list('adfasbasfwewefwefweeeeasdfasnbam'))
+        assert_series_equal(s.mode(), Series(['e']))
+
+        # datetimes: empty when nothing repeats, dtype preserved
+        s = Series(['2011-01-03', '2013-01-02', '1900-05-03'], dtype='M8[ns]')
+        assert_series_equal(s.mode(), Series([], dtype="M8[ns]"))
+        s = Series(['2011-01-03', '2013-01-02', '1900-05-03', '2011-01-03',
+                    '2013-01-02'], dtype='M8[ns]')
+        assert_series_equal(s.mode(), Series(['2011-01-03', '2013-01-02'],
+                                             dtype='M8[ns]'))
+
     def test_prod(self):
         self._check_stat_op('prod', np.prod)