diff --git a/doc/source/api.rst b/doc/source/api.rst
index 20bfe037f7373..c2c2bc8710af2 100644
--- a/doc/source/api.rst
+++ b/doc/source/api.rst
@@ -348,6 +348,7 @@ Computations / Descriptive Stats
    Series.mean
    Series.median
    Series.min
+   Series.mode
    Series.nunique
    Series.pct_change
    Series.prod
@@ -632,6 +633,7 @@ Computations / Descriptive Stats
    DataFrame.mean
    DataFrame.median
    DataFrame.min
+   DataFrame.mode
    DataFrame.pct_change
    DataFrame.prod
    DataFrame.quantile
diff --git a/doc/source/basics.rst b/doc/source/basics.rst
index 78b0a54b8893f..19f293561efb7 100644
--- a/doc/source/basics.rst
+++ b/doc/source/basics.rst
@@ -378,6 +378,7 @@ optional ``level`` parameter which applies only if the object has a
     ``median``, Arithmetic median of values
     ``min``, Minimum
     ``max``, Maximum
+    ``mode``, Mode
     ``abs``, Absolute Value
     ``prod``, Product of values
     ``std``, Unbiased standard deviation
@@ -473,8 +474,8 @@ value, ``idxmin`` and ``idxmax`` return the first matching index:
 
 .. _basics.discretization:
 
-Value counts (histogramming)
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Value counts (histogramming) / Mode
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
 The ``value_counts`` Series method and top-level function computes a histogram
 of a 1D array of values. It can also be used as a function on regular arrays:
@@ -487,6 +488,17 @@ of a 1D array of values. It can also be used as a function on regular arrays:
    s.value_counts()
    value_counts(data)
 
+Similarly, you can get the most frequently occurring value(s) (the mode) of the values in a Series or DataFrame:
+
+.. ipython:: python
+
+   data = [1, 1, 3, 3, 3, 5, 5, 7, 7, 7]
+   s = Series(data)
+   s.mode()
+   df = pd.DataFrame({"A": np.random.randint(0, 7, size=50),
+                      "B": np.random.randint(-10, 15, size=50)})
+   df.mode()
+
 Discretization and quantiling
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
@@ -514,6 +526,7 @@ normally distributed data into equal-size quartiles like so:
    value_counts(factor)
 
 We can also pass infinite values to define the bins:
+
 .. ipython:: python
 
    arr = np.random.randn(20)
diff --git a/doc/source/release.rst b/doc/source/release.rst
index 4b33c20424b33..5fef061b9e447 100644
--- a/doc/source/release.rst
+++ b/doc/source/release.rst
@@ -65,6 +65,8 @@ New features
     string via the ``date_format`` keyword (:issue:`4313`)
   - Added ``LastWeekOfMonth`` DateOffset (:issue:`4637`)
   - Added ``FY5253``, and ``FY5253Quarter`` DateOffsets (:issue:`4511`)
+  - Added ``mode()`` method to ``Series`` and ``DataFrame`` to get the
+    statistical mode(s) of a column/series. (:issue:`5367`)
 
 Experimental Features
 ~~~~~~~~~~~~~~~~~~~~~
diff --git a/doc/source/v0.13.0.txt b/doc/source/v0.13.0.txt
index 27794d6af1a8d..9b958d59d5a74 100644
--- a/doc/source/v0.13.0.txt
+++ b/doc/source/v0.13.0.txt
@@ -81,6 +81,8 @@ API changes
   ``SparsePanel``, etc.), now support the entire set of arithmetic operators
   and arithmetic flex methods (add, sub, mul, etc.). ``SparsePanel`` does not
   support ``pow`` or ``mod`` with non-scalars. (:issue:`3765`)
+- ``Series`` and ``DataFrame`` now have a ``mode()`` method to calculate the
+  statistical mode(s) by axis/Series. (:issue:`5367`)
 
 
 Prior Version Deprecations/Changes
diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
index 5778a524a584a..6ab6f15aaab15 100644
--- a/pandas/core/algorithms.py
+++ b/pandas/core/algorithms.py
@@ -3,6 +3,7 @@
 intended for public consumption
 """
 
+from warnings import warn
 import numpy as np
 
 import pandas.core.common as com
@@ -221,6 +222,41 @@ def value_counts(values, sort=True, ascending=False, normalize=False, bins=None)
     return result
 
 
+def mode(values):
+    "Returns the mode(s) of the passed Series or ndarray (sorted)"
+    # must sort because hash order isn't necessarily defined.
+    from pandas.core.series import Series
+
+    if isinstance(values, Series):
+        constructor = values._constructor
+        values = values.values
+    else:
+        values = np.asanyarray(values)
+        constructor = Series
+
+    dtype = values.dtype
+    if com.is_integer_dtype(values.dtype):
+        values = com._ensure_int64(values)
+        result = constructor(sorted(htable.mode_int64(values)), dtype=dtype)
+
+    elif issubclass(values.dtype.type, (np.datetime64, np.timedelta64)):
+        dtype = values.dtype
+        values = values.view(np.int64)
+        result = constructor(sorted(htable.mode_int64(values)), dtype=dtype)
+
+    else:
+        mask = com.isnull(values)
+        values = com._ensure_object(values)
+        res = htable.mode_object(values, mask)
+        try:
+            res = sorted(res)
+        except TypeError as e:
+            warn("Unable to sort modes: %s" % e)
+        result = constructor(res, dtype=dtype)
+
+    return result
+
+
 def rank(values, axis=0, method='average', na_option='keep',
          ascending=True):
     """
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index 0a5306de9bbb5..1cb0c4adcf5d4 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -4083,6 +4083,28 @@ def _get_agg_axis(self, axis_num):
         else:
             raise ValueError('Axis must be 0 or 1 (got %r)' % axis_num)
 
+    def mode(self, axis=0, numeric_only=False):
+        """
+        Gets the mode(s) of each column (or of each row with ``axis=1``).
+        Empty if nothing occurs at least twice. Adds a row for each mode per
+        label, filling gaps with nan.
+
+        Parameters
+        ----------
+        axis : {0, 1, 'index', 'columns'} (default 0)
+            0/'index' : get mode of each column
+            1/'columns' : get mode of each row
+        numeric_only : bool, default False
+            if True, only apply to numeric columns
+
+        Returns
+        -------
+        modes : DataFrame (sorted)
+        """
+        data = self if not numeric_only else self._get_numeric_data()
+        f = lambda s: s.mode()
+        return data.apply(f, axis=axis)
+
     def quantile(self, q=0.5, axis=0, numeric_only=True):
         """
         Return values at the given quantile over requested axis, a la
diff --git a/pandas/core/series.py b/pandas/core/series.py
index 798183a29c48b..3fe9540ba3fe0 100644
--- a/pandas/core/series.py
+++ b/pandas/core/series.py
@@ -1127,6 +1127,20 @@ def value_counts(self, normalize=False, sort=True, ascending=False, bins=None):
         return value_counts(self.values, sort=sort, ascending=ascending,
                             normalize=normalize, bins=bins)
 
+    def mode(self):
+        """Returns the mode(s) of the dataset.
+
+        Empty if nothing occurs at least 2 times. Always returns a Series,
+        even if only one value is returned.
+
+        Returns
+        -------
+        modes : Series (sorted)
+        """
+        # TODO: Add option for bins like value_counts()
+        from pandas.core.algorithms import mode
+        return mode(self)
+
     def unique(self):
         """
         Return array of unique values in the Series. Significantly faster than
diff --git a/pandas/hashtable.pyx b/pandas/hashtable.pyx
index 1b132ea91f515..10c43478a5352 100644
--- a/pandas/hashtable.pyx
+++ b/pandas/hashtable.pyx
@@ -890,15 +890,12 @@ cdef class Int64Factorizer:
 
         return labels
 
-
-def value_count_int64(ndarray[int64_t] values):
+cdef build_count_table_int64(ndarray[int64_t] values, kh_int64_t *table):
     cdef:
+        int k
         Py_ssize_t i, n = len(values)
-        kh_int64_t *table
         int ret = 0
-        list uniques = []
 
-    table = kh_init_int64()
     kh_resize_int64(table, n)
 
     for i in range(n):
@@ -910,8 +907,17 @@ def value_count_int64(ndarray[int64_t] values):
             k = kh_put_int64(table, val, &ret)
             table.vals[k] = 1
 
-    # for (k = kh_begin(h); k != kh_end(h); ++k)
-    #     if (kh_exist(h, k)) kh_value(h, k) = 1;
+
+cpdef value_count_int64(ndarray[int64_t] values):
+    cdef:
+        Py_ssize_t i
+        kh_int64_t *table
+        int ret = 0
+        int k
+
+    table = kh_init_int64()
+    build_count_table_int64(values, table)
+
     i = 0
     result_keys = np.empty(table.n_occupied, dtype=np.int64)
     result_counts = np.zeros(table.n_occupied, dtype=np.int64)
@@ -924,15 +930,15 @@
 
     return result_keys, result_counts
 
-def value_count_object(ndarray[object] values,
-                       ndarray[uint8_t, cast=True] mask):
+
+cdef build_count_table_object(ndarray[object] values,
+                              ndarray[uint8_t, cast=True] mask,
+                              kh_pymap_t *table):
     cdef:
+        int k
         Py_ssize_t i, n = len(values)
-        kh_pymap_t *table
         int ret = 0
-        list uniques = []
 
-    table = kh_init_pymap()
     kh_resize_pymap(table, n // 10)
 
     for i in range(n):
@@ -947,6 +953,17 @@
             k = kh_put_pymap(table, val, &ret)
             table.vals[k] = 1
 
+
+cpdef value_count_object(ndarray[object] values,
+                         ndarray[uint8_t, cast=True] mask):
+    cdef:
+        Py_ssize_t i = len(values)
+        kh_pymap_t *table
+        int k
+
+    table = kh_init_pymap()
+    build_count_table_object(values, mask, table)
+
     i = 0
     result_keys = np.empty(table.n_occupied, dtype=object)
     result_counts = np.zeros(table.n_occupied, dtype=np.int64)
@@ -959,3 +976,64 @@
 
     return result_keys, result_counts
 
+
+def mode_object(ndarray[object] values, ndarray[uint8_t, cast=True] mask):
+    cdef:
+        int count, max_count = 2
+        int j = -1  # so you can do +=
+        int k
+        Py_ssize_t i, n = len(values)
+        kh_pymap_t *table
+        int ret = 0
+
+    table = kh_init_pymap()
+    build_count_table_object(values, mask, table)
+
+    modes = np.empty(table.n_buckets, dtype=np.object_)
+    for k in range(table.n_buckets):
+        if kh_exist_pymap(table, k):
+            count = table.vals[k]
+
+            if count == max_count:
+                j += 1
+            elif count > max_count:
+                max_count = count
+                j = 0
+            else:
+                continue
+            modes[j] = table.keys[k]
+
+    kh_destroy_pymap(table)
+
+    return modes[:j+1]
+
+
+def mode_int64(ndarray[int64_t] values):
+    cdef:
+        int val, max_val = 2
+        int j = -1  # so you can do +=
+        int k
+        kh_int64_t *table
+        list uniques = []
+
+    table = kh_init_int64()
+
+    build_count_table_int64(values, table)
+
+    modes = np.empty(table.n_buckets, dtype=np.int64)
+    for k in range(table.n_buckets):
+        if kh_exist_int64(table, k):
+            val = table.vals[k]
+
+            if val == max_val:
+                j += 1
+            elif val > max_val:
+                max_val = val
+                j = 0
+            else:
+                continue
+            modes[j] = table.keys[k]
+
+    kh_destroy_int64(table)
+
+    return modes[:j+1]
diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py
index 2f7696f2520fc..e0abe7700f28d 100644
--- a/pandas/tests/test_frame.py
+++ b/pandas/tests/test_frame.py
@@ -10050,6 +10050,55 @@ def wrapper(x):
             self.assert_(np.isnan(r0).all())
             self.assert_(np.isnan(r1).all())
 
+    def test_mode(self):
+        df = pd.DataFrame({"A": [12, 12, 11, 12, 19, 11],
+                           "B": [10, 10, 10, np.nan, 3, 4],
+                           "C": [8, 8, 8, 9, 9, 9],
+                           "D": range(6),
+                           "E": [8, 8, 1, 1, 3, 3]})
+        assert_frame_equal(df[["A"]].mode(),
+                           pd.DataFrame({"A": [12]}))
+        assert_frame_equal(df[["D"]].mode(),
+                           pd.DataFrame(pd.Series([], dtype="int64"),
+                                        columns=["D"]))
+        assert_frame_equal(df[["E"]].mode(),
+                           pd.DataFrame(pd.Series([1, 3, 8], dtype="int64"),
+                                        columns=["E"]))
+        assert_frame_equal(df[["A", "B"]].mode(),
+                           pd.DataFrame({"A": [12], "B": [10.]}))
+        assert_frame_equal(df.mode(),
+                           pd.DataFrame({"A": [12, np.nan, np.nan],
+                                         "B": [10, np.nan, np.nan],
+                                         "C": [8, 9, np.nan],
+                                         "D": [np.nan, np.nan, np.nan],
+                                         "E": [1, 3, 8]}))
+
+        # outputs in sorted order
+        df["C"] = list(reversed(df["C"]))
+        a, b = (df[["A", "B", "C"]].mode(),
+                pd.DataFrame({"A": [12, np.nan],
+                              "B": [10, np.nan],
+                              "C": [8, 9]}))
+        assert_frame_equal(a, b)
+        # should work with heterogeneous types
+        df = pd.DataFrame({"A": range(6),
+                           "B": pd.date_range('2011', periods=6),
+                           "C": list('abcdef')})
+        exp = pd.DataFrame({"A": pd.Series([], dtype=df["A"].dtype),
+                            "B": pd.Series([], dtype=df["B"].dtype),
+                            "C": pd.Series([], dtype=df["C"].dtype)})
+        assert_frame_equal(df.mode(), exp)
+
+        # and also when not empty
+        df.loc[1, "A"] = 0
+        df.loc[4, "B"] = df.loc[3, "B"]
+        df.loc[5, "C"] = 'e'
+        exp = pd.DataFrame({"A": pd.Series([0], dtype=df["A"].dtype),
+                            "B": pd.Series([df.loc[3, "B"]], dtype=df["B"].dtype),
+                            "C": pd.Series(['e'], dtype=df["C"].dtype)})
+
+        assert_frame_equal(df.mode(), exp)
+
     def test_sum_corner(self):
         axis0 = self.empty.sum(0)
         axis1 = self.empty.sum(1)
diff --git a/pandas/tests/test_series.py b/pandas/tests/test_series.py
index e5186cb3030ff..91fa1f1a19ffc 100644
--- a/pandas/tests/test_series.py
+++ b/pandas/tests/test_series.py
@@ -19,10 +19,8 @@
 from pandas.core.index import MultiIndex
 from pandas.tseries.index import Timestamp, DatetimeIndex
 import pandas.core.config as cf
-import pandas.core.series as smod
 
 import pandas.lib as lib
 
-import pandas.core.common as com
 import pandas.core.datetools as datetools
 import pandas.core.nanops as nanops
@@ -1721,6 +1719,35 @@ def test_median(self):
         int_ts = Series(np.ones(10, dtype=int), index=lrange(10))
         self.assertAlmostEqual(np.median(int_ts), int_ts.median())
 
+    def test_mode(self):
+        s = Series([12, 12, 11, 10, 19, 11])
+        exp = Series([11, 12])
+        assert_series_equal(s.mode(), exp)
+
+        assert_series_equal(Series([1, 2, 3]).mode(), Series([], dtype=int))
+
+        lst = [5] * 20 + [1] * 10 + [6] * 25
+        np.random.shuffle(lst)
+        s = Series(lst)
+        assert_series_equal(s.mode(), Series([6]))
+
+        s = Series([5] * 10)
+        assert_series_equal(s.mode(), Series([5]))
+
+        s = Series(lst)
+        s[0] = np.nan
+        assert_series_equal(s.mode(), Series([6], dtype=float))
+
+        s = Series(list('adfasbasfwewefwefweeeeasdfasnbam'))
+        assert_series_equal(s.mode(), Series(['e']))
+
+        s = Series(['2011-01-03', '2013-01-02', '1900-05-03'], dtype='M8[ns]')
+        assert_series_equal(s.mode(), Series([], dtype="M8[ns]"))
+        s = Series(['2011-01-03', '2013-01-02', '1900-05-03', '2011-01-03',
+                    '2013-01-02'], dtype='M8[ns]')
+        assert_series_equal(s.mode(), Series(['2011-01-03', '2013-01-02'],
+                                             dtype='M8[ns]'))
+
     def test_prod(self):
         self._check_stat_op('prod', np.prod)
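
For reference, a minimal usage sketch of the API this patch adds (not part of the diff itself). The behaviour shown in the comments follows the docstrings and tests above: ties are returned sorted, a value must occur at least twice to count as a mode, and ``DataFrame.mode()`` stacks each column's modes, padding shorter columns with NaN. The data below are illustrative only.

    import numpy as np
    import pandas as pd

    s = pd.Series([12, 12, 11, 10, 19, 11])
    s.mode()                    # 11 and 12 each occur twice -> Series([11, 12])

    df = pd.DataFrame({"A": [12, 12, 11, 12, 19, 11],
                       "B": [10, 10, 10, np.nan, 3, 4],
                       "C": list("abcdef")})
    df.mode()                   # one row per mode; "C" has no repeats, so it is all NaN
    df.mode(numeric_only=True)  # restrict to the numeric columns "A" and "B"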
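The Cython kernels ``mode_int64`` and ``mode_object`` above walk a khash count table and keep every key whose count ties the running maximum, never counting singletons; ``algorithms.mode()`` then sorts the result at the Python level. A rough pure-Python equivalent, for illustration only — the ``mode_sketch`` helper is hypothetical, ``collections.Counter`` stands in for the hash table, and the ``v == v`` test is a simplified stand-in for the ``isnull`` mask:

    from collections import Counter

    def mode_sketch(values):
        # Count occurrences, dropping float NaN (NaN != NaN).
        counts = Counter(v for v in values if v == v)
        max_count = 2        # a value seen only once is never a mode here
        modes = []
        for val, count in counts.items():
            if count == max_count:
                modes.append(val)
            elif count > max_count:
                max_count = count
                modes = [val]
        return sorted(modes)  # sorted because hash iteration order is undefined

    print(mode_sketch([1, 1, 3, 3, 3, 5, 5, 7, 7, 7]))   # [3, 7]
    print(mode_sketch([1, 2, 3]))                        # [] -- nothing occurs twice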