Merge pull request #5380 from jtratner/add-mode-to-series-and-frame

jtratner · jtratner · commit 2d2e8b514602 · 2013-11-04T22:00:57.000-08:00
ENH: Add mode method to Series and DataFrame
diff --git a/doc/source/api.rst b/doc/source/api.rst
@@ -348,6 +348,7 @@ Computations / Descriptive Stats
    Series.mean
    Series.median
    Series.min
+   Series.mode
    Series.nunique
    Series.pct_change
    Series.prod
@@ -632,6 +633,7 @@ Computations / Descriptive Stats
    DataFrame.mean
    DataFrame.median
    DataFrame.min
+   DataFrame.mode
    DataFrame.pct_change
    DataFrame.prod
    DataFrame.quantile
diff --git a/doc/source/basics.rst b/doc/source/basics.rst
@@ -378,6 +378,7 @@ optional ``level`` parameter which applies only if the object has a
     ``median``, Arithmetic median of values
     ``min``, Minimum
     ``max``, Maximum
+    ``mode``, Mode
     ``abs``, Absolute Value
     ``prod``, Product of values
     ``std``, Unbiased standard deviation
@@ -473,8 +474,8 @@ value, ``idxmin`` and ``idxmax`` return the first matching index:
 
 .. _basics.discretization:
 
-Value counts (histogramming)
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Value counts (histogramming) / Mode
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
 The ``value_counts`` Series method and top-level function computes a histogram
 of a 1D array of values. It can also be used as a function on regular arrays:
@@ -487,6 +488,17 @@ of a 1D array of values. It can also be used as a function on regular arrays:
    s.value_counts()
    value_counts(data)
 
+Similarly, you can get the most frequently occurring value(s) (the mode) of the values in a Series or DataFrame:
+
+.. ipython:: python
+
+    data = [1, 1, 3, 3, 3, 5, 5, 7, 7, 7]
+    s = Series(data)
+    s.mode()
+    df = pd.DataFrame({"A": np.random.randint(0, 7, size=50),
+                       "B": np.random.randint(-10, 15, size=50)})
+    df.mode()
+
 
 Discretization and quantiling
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -514,6 +526,7 @@ normally distributed data into equal-size quartiles like so:
    value_counts(factor)
 
 We can also pass infinite values to define the bins:
+
 .. ipython:: python
 
    arr = np.random.randn(20)
diff --git a/doc/source/release.rst b/doc/source/release.rst
@@ -65,6 +65,8 @@ New features
     string via the ``date_format`` keyword (:issue:`4313`)
   - Added ``LastWeekOfMonth`` DateOffset (:issue:`4637`)
   - Added ``FY5253``, and ``FY5253Quarter`` DateOffsets (:issue:`4511`)
+  - Added ``mode()`` method to ``Series`` and ``DataFrame`` to get the
+    statistical mode(s) of a column/series. (:issue:`5367`)
 
 Experimental Features
 ~~~~~~~~~~~~~~~~~~~~~
diff --git a/doc/source/v0.13.0.txt b/doc/source/v0.13.0.txt
@@ -81,6 +81,8 @@ API changes
   ``SparsePanel``, etc.), now support the entire set of arithmetic operators
   and arithmetic flex methods (add, sub, mul, etc.). ``SparsePanel`` does not
   support ``pow`` or ``mod`` with non-scalars. (:issue:`3765`)
+- ``Series`` and ``DataFrame`` now have a ``mode()`` method to calculate the
+  statistical mode(s) by axis/Series. (:issue:`5367`)
 
 
 Prior Version Deprecations/Changes
diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
@@ -3,6 +3,7 @@
 intended for public consumption
 """
 
+from warnings import warn
 import numpy as np
 
 import pandas.core.common as com
@@ -221,6 +222,41 @@ def value_counts(values, sort=True, ascending=False, normalize=False, bins=None)
     return result
 
 
+def mode(values):
+    """Return the sorted mode(s) of a Series or array-like, as a Series."""
+    # sort the result because hash table iteration order isn't defined.
+    from pandas.core.series import Series
+
+    if isinstance(values, Series):
+        constructor = values._constructor
+        values = values.values
+    else:
+        values = np.asanyarray(values)
+        constructor = Series
+
+    dtype = values.dtype
+    if com.is_integer_dtype(values.dtype):
+        values = com._ensure_int64(values)
+        result = constructor(sorted(htable.mode_int64(values)), dtype=dtype)
+
+    elif issubclass(values.dtype.type, (np.datetime64,np.timedelta64)):
+        dtype = values.dtype  # same value as the assignment above
+        values = values.view(np.int64)  # count on the int64 representation
+        result = constructor(sorted(htable.mode_int64(values)), dtype=dtype)
+
+    else:
+        mask = com.isnull(values)
+        values = com._ensure_object(values)
+        res = htable.mode_object(values, mask)
+        try:
+            res = sorted(res)
+        except TypeError as e:  # unorderable values: keep the unsorted result
+            warn("Unable to sort modes: %s" % e)
+        result = constructor(res, dtype=dtype)
+
+    return result
+
+
 def rank(values, axis=0, method='average', na_option='keep',
          ascending=True):
     """
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -4083,6 +4083,28 @@ def _get_agg_axis(self, axis_num):
         else:
             raise ValueError('Axis must be 0 or 1 (got %r)' % axis_num)
 
+    def mode(self, axis=0, numeric_only=False):
+        """
+        Gets the mode(s) of each column (or row) along the selected axis. Empty
+        if no value occurs 2+ times. Adds a row for each mode per label, and
+        fills in gaps with nan.
+
+        Parameters
+        ----------
+        axis : {0, 1, 'index', 'columns'} (default 0)
+            0/'index' : get mode of each column
+            1/'columns' : get mode of each row
+        numeric_only : bool, default False
+            if True, only apply to numeric columns
+
+        Returns
+        -------
+        modes : DataFrame (sorted)
+        """
+        data = self if not numeric_only else self._get_numeric_data()
+        f = lambda s: s.mode()
+        return data.apply(f, axis=axis)
+
     def quantile(self, q=0.5, axis=0, numeric_only=True):
         """
         Return values at the given quantile over requested axis, a la
diff --git a/pandas/core/series.py b/pandas/core/series.py
@@ -1127,6 +1127,26 @@ def value_counts(self, normalize=False, sort=True, ascending=False, bins=None):
         return value_counts(self.values, sort=sort, ascending=ascending,
                             normalize=normalize, bins=bins)
 
+    def mode(self):
+        """Returns the mode(s) of the dataset.
+
+        Empty if nothing occurs at least 2 times.  Always returns Series even
+        if only one value.
+
+        Returns
+        -------
+        modes : Series (sorted)
+
+        Notes
+        -----
+        Takes no arguments. Sorting of the result is attempted; if the values
+        cannot be compared, a warning is issued and the order is not defined.
+        """
+        # TODO: Add option for bins like value_counts()
+        from pandas.core.algorithms import mode
+        return mode(self)
+
     def unique(self):
         """
         Return array of unique values in the Series. Significantly faster than
diff --git a/pandas/hashtable.pyx b/pandas/hashtable.pyx
@@ -890,15 +890,12 @@ cdef class Int64Factorizer:
         return labels
 
 
-
-def value_count_int64(ndarray[int64_t] values):
+cdef build_count_table_int64(ndarray[int64_t] values, kh_int64_t *table):
     cdef:
+        int k
         Py_ssize_t i, n = len(values)
-        kh_int64_t *table
         int ret = 0
-        list uniques = []
 
-    table = kh_init_int64()
     kh_resize_int64(table, n)
 
     for i in range(n):
@@ -910,8 +907,17 @@ def value_count_int64(ndarray[int64_t] values):
             k = kh_put_int64(table, val, &ret)
             table.vals[k] = 1
 
-    # for (k = kh_begin(h); k != kh_end(h); ++k)
-    # 	if (kh_exist(h, k)) kh_value(h, k) = 1;
+
+cpdef value_count_int64(ndarray[int64_t] values):
+    cdef:
+        Py_ssize_t i
+        kh_int64_t *table
+        int ret = 0
+        int k
+
+    table = kh_init_int64()
+    build_count_table_int64(values, table)
+
     i = 0
     result_keys = np.empty(table.n_occupied, dtype=np.int64)
     result_counts = np.zeros(table.n_occupied, dtype=np.int64)
@@ -924,15 +930,15 @@ def value_count_int64(ndarray[int64_t] values):
 
     return result_keys, result_counts
 
-def value_count_object(ndarray[object] values,
-                       ndarray[uint8_t, cast=True] mask):
+
+cdef build_count_table_object(ndarray[object] values,
+                              ndarray[uint8_t, cast=True] mask,
+                              kh_pymap_t *table):
     cdef:
+        int k
         Py_ssize_t i, n = len(values)
-        kh_pymap_t *table
         int ret = 0
-        list uniques = []
 
-    table = kh_init_pymap()
     kh_resize_pymap(table, n // 10)
 
     for i in range(n):
@@ -947,6 +953,17 @@ def value_count_object(ndarray[object] values,
             k = kh_put_pymap(table, <PyObject*> val, &ret)
             table.vals[k] = 1
 
+
+cpdef value_count_object(ndarray[object] values,
+                       ndarray[uint8_t, cast=True] mask):
+    cdef:
+        Py_ssize_t i = len(values)
+        kh_pymap_t *table
+        int k
+
+    table = kh_init_pymap()
+    build_count_table_object(values, mask, table)
+
     i = 0
     result_keys = np.empty(table.n_occupied, dtype=object)
     result_counts = np.zeros(table.n_occupied, dtype=np.int64)
@@ -959,3 +976,64 @@ def value_count_object(ndarray[object] values,
 
     return result_keys, result_counts
 
+
+def mode_object(ndarray[object] values, ndarray[uint8_t, cast=True] mask):
+    cdef:
+        int count, max_count = 2  # starts at 2: a count of 1 never qualifies
+        int j = -1  # index of the last mode stored in `modes`; -1 = none yet
+        int k
+        Py_ssize_t i, n = len(values)  # NOTE(review): i and n are unused below
+        kh_pymap_t *table
+        int ret = 0  # NOTE(review): unused in this function
+
+    table = kh_init_pymap()
+    build_count_table_object(values, mask, table)
+
+    modes = np.empty(table.n_buckets, dtype=np.object_)  # at most one write per bucket
+    for k in range(table.n_buckets):
+        if kh_exist_pymap(table, k):
+            count = table.vals[k]
+
+            if count == max_count:
+                j += 1  # ties with the current best: append
+            elif count > max_count:
+                max_count = count
+                j = 0  # new highest count: overwrite the modes collected so far
+            else:
+                continue
+            modes[j] = <object> table.keys[k]
+
+    kh_destroy_pymap(table)
+
+    return modes[:j+1]
+
+
+def mode_int64(ndarray[int64_t] values):
+    cdef:
+        int val, max_val = 2  # val holds an occurrence count; 2 is the minimum to qualify
+        int j = -1  # index of the last mode stored in `modes`; -1 = none yet
+        int k
+        kh_int64_t *table
+        list uniques = []  # NOTE(review): unused in this function
+
+    table = kh_init_int64()
+
+    build_count_table_int64(values, table)
+
+    modes = np.empty(table.n_buckets, dtype=np.int64)  # at most one write per bucket
+    for k in range(table.n_buckets):
+        if kh_exist_int64(table, k):
+            val = table.vals[k]
+
+            if val == max_val:
+                j += 1  # ties with the current best: append
+            elif val > max_val:
+                max_val = val
+                j = 0  # new highest count: overwrite the modes collected so far
+            else:
+                continue
+            modes[j] = table.keys[k]
+
+    kh_destroy_int64(table)
+
+    return modes[:j+1]
diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py
@@ -10050,6 +10050,59 @@ def wrapper(x):
             self.assert_(np.isnan(r0).all())
             self.assert_(np.isnan(r1).all())
 
+    def test_mode(self):
+        df = pd.DataFrame({"A": [12, 12, 11, 12, 19, 11],
+                           "B": [10, 10, 10, np.nan, 3, 4],
+                           "C": [8, 8, 8, 9, 9, 9],
+                           "D": range(6),
+                           "E": [8, 8, 1, 1, 3, 3]})
+        assert_frame_equal(df[["A"]].mode(),
+                           pd.DataFrame({"A": [12]}))
+        assert_frame_equal(df[["D"]].mode(),
+                           pd.DataFrame(pd.Series([], dtype="int64"),
+                                        columns=["D"]))
+        assert_frame_equal(df[["E"]].mode(),
+                           pd.DataFrame(pd.Series([1, 3, 8], dtype="int64"),
+                                        columns=["E"]))
+        assert_frame_equal(df[["A", "B"]].mode(),
+                           pd.DataFrame({"A": [12], "B": [10.]}))
+        assert_frame_equal(df.mode(),
+                           pd.DataFrame({"A": [12, np.nan, np.nan],
+                                         "B": [10, np.nan, np.nan],
+                                         "C": [8, 9, np.nan],
+                                         "D": [np.nan, np.nan, np.nan],
+                                         "E": [1, 3, 8]}))
+
+        # modes come out sorted even when the input column is reversed
+        df["C"] = list(reversed(df["C"]))
+        print(df["C"])  # NOTE(review): debug output, not used by the assertions
+        print(df["C"].mode())
+        a, b = (df[["A", "B", "C"]].mode(),
+                           pd.DataFrame({"A": [12, np.nan],
+                                         "B": [10, np.nan],
+                                         "C": [8, 9]}))
+        print(a)  # NOTE(review): debug output, not used by the assertions
+        print(b)
+        assert_frame_equal(a, b)
+        # mixed column dtypes with no repeated values: expect an empty frame
+        df = pd.DataFrame({"A": range(6),
+                           "B": pd.date_range('2011', periods=6),
+                           "C": list('abcdef')})
+        exp = pd.DataFrame({"A": pd.Series([], dtype=df["A"].dtype),
+                            "B": pd.Series([], dtype=df["B"].dtype),
+                            "C": pd.Series([], dtype=df["C"].dtype)})
+        assert_frame_equal(df.mode(), exp)
+
+        # same mixed dtypes, now with one repeated value in each column
+        df.loc[1, "A"] = 0
+        df.loc[4, "B"] = df.loc[3, "B"]
+        df.loc[5, "C"] = 'e'
+        exp = pd.DataFrame({"A": pd.Series([0], dtype=df["A"].dtype),
+                            "B": pd.Series([df.loc[3, "B"]], dtype=df["B"].dtype),
+                            "C": pd.Series(['e'], dtype=df["C"].dtype)})
+
+        assert_frame_equal(df.mode(), exp)
+
     def test_sum_corner(self):
         axis0 = self.empty.sum(0)
         axis1 = self.empty.sum(1)
diff --git a/pandas/tests/test_series.py b/pandas/tests/test_series.py