Skip to content

ENH: Add mode method to Series and DataFrame #5380

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Nov 5, 2013
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions doc/source/api.rst
Original file line number Diff line number Diff line change
Expand Up @@ -348,6 +348,7 @@ Computations / Descriptive Stats
Series.mean
Series.median
Series.min
Series.mode
Series.nunique
Series.pct_change
Series.prod
Expand Down Expand Up @@ -632,6 +633,7 @@ Computations / Descriptive Stats
DataFrame.mean
DataFrame.median
DataFrame.min
DataFrame.mode
DataFrame.pct_change
DataFrame.prod
DataFrame.quantile
Expand Down
17 changes: 15 additions & 2 deletions doc/source/basics.rst
Original file line number Diff line number Diff line change
Expand Up @@ -378,6 +378,7 @@ optional ``level`` parameter which applies only if the object has a
``median``, Arithmetic median of values
``min``, Minimum
``max``, Maximum
``mode``, Mode
``abs``, Absolute Value
``prod``, Product of values
``std``, Unbiased standard deviation
Expand Down Expand Up @@ -473,8 +474,8 @@ value, ``idxmin`` and ``idxmax`` return the first matching index:

.. _basics.discretization:

Value counts (histogramming)
~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Value counts (histogramming) / Mode
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

The ``value_counts`` Series method and top-level function computes a histogram
of a 1D array of values. It can also be used as a function on regular arrays:
Expand All @@ -487,6 +488,17 @@ of a 1D array of values. It can also be used as a function on regular arrays:
s.value_counts()
value_counts(data)

Similarly, you can get the most frequently occurring value(s) (the mode) of the values in a Series or DataFrame:

.. ipython:: python

data = [1, 1, 3, 3, 3, 5, 5, 7, 7, 7]
s = Series(data)
s.mode()
df = pd.DataFrame({"A": np.random.randint(0, 7, size=50),
"B": np.random.randint(-10, 15, size=50)})
df.mode()


Discretization and quantiling
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Expand Down Expand Up @@ -514,6 +526,7 @@ normally distributed data into equal-size quartiles like so:
value_counts(factor)

We can also pass infinite values to define the bins:

.. ipython:: python

arr = np.random.randn(20)
Expand Down
2 changes: 2 additions & 0 deletions doc/source/release.rst
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,8 @@ New features
string via the ``date_format`` keyword (:issue:`4313`)
- Added ``LastWeekOfMonth`` DateOffset (:issue:`4637`)
- Added ``FY5253``, and ``FY5253Quarter`` DateOffsets (:issue:`4511`)
- Added ``mode()`` method to ``Series`` and ``DataFrame`` to get the
statistical mode(s) of a column/series. (:issue:`5367`)

Experimental Features
~~~~~~~~~~~~~~~~~~~~~
Expand Down
2 changes: 2 additions & 0 deletions doc/source/v0.13.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,8 @@ API changes
``SparsePanel``, etc.), now support the entire set of arithmetic operators
and arithmetic flex methods (add, sub, mul, etc.). ``SparsePanel`` does not
support ``pow`` or ``mod`` with non-scalars. (:issue:`3765`)
- ``Series`` and ``DataFrame`` now have a ``mode()`` method to calculate the
statistical mode(s) by axis/Series. (:issue:`5367`)


Prior Version Deprecations/Changes
Expand Down
36 changes: 36 additions & 0 deletions pandas/core/algorithms.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
intended for public consumption
"""

from warnings import warn
import numpy as np

import pandas.core.common as com
Expand Down Expand Up @@ -221,6 +222,41 @@ def value_counts(values, sort=True, ascending=False, normalize=False, bins=None)
return result


def mode(values):
    """
    Return the mode(s) of the passed Series or ndarray, sorted when possible.

    Parameters
    ----------
    values : Series or array-like
        A Series is unwrapped to its underlying values and the result is
        built with that Series' own constructor; anything else is coerced
        with ``np.asanyarray`` and the result is a plain Series.

    Returns
    -------
    modes : Series
        Built with the dtype of the input values. If the modes cannot be
        ordered (``sorted`` raises ``TypeError``) a warning is issued and
        they are returned in hash-table order.
    """
    from pandas.core.series import Series

    if isinstance(values, Series):
        make_result = values._constructor
        arr = values.values
    else:
        arr = np.asanyarray(values)
        make_result = Series

    orig_dtype = arr.dtype

    # The hash table gives no ordering guarantee, hence the explicit sorts.
    if com.is_integer_dtype(orig_dtype):
        found = htable.mode_int64(com._ensure_int64(arr))
        return make_result(sorted(found), dtype=orig_dtype)

    if issubclass(orig_dtype.type, (np.datetime64, np.timedelta64)):
        # Count on the int64 view; the original dtype is restored on the
        # way out.
        found = htable.mode_int64(arr.view(np.int64))
        return make_result(sorted(found), dtype=orig_dtype)

    null_mask = com.isnull(arr)
    found = htable.mode_object(com._ensure_object(arr), null_mask)
    try:
        found = sorted(found)
    except TypeError as err:
        warn("Unable to sort modes: %s" % err)
    return make_result(found, dtype=orig_dtype)


def rank(values, axis=0, method='average', na_option='keep',
ascending=True):
"""
Expand Down
22 changes: 22 additions & 0 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -4083,6 +4083,28 @@ def _get_agg_axis(self, axis_num):
else:
raise ValueError('Axis must be 0 or 1 (got %r)' % axis_num)

def mode(self, axis=0, numeric_only=False):
    """
    Compute the mode(s) of each column or row.

    The result is empty if nothing has 2+ occurrences. One row is added
    for each mode per label, and gaps are filled with nan.

    Parameters
    ----------
    axis : {0, 1, 'index', 'columns'} (default 0)
        0/'index' : get mode of each column
        1/'columns' : get mode of each row
    numeric_only : bool, default False
        if True, only apply to numeric columns

    Returns
    -------
    modes : DataFrame (sorted)
    """
    if numeric_only:
        frame = self._get_numeric_data()
    else:
        frame = self
    # Delegate to Series.mode for every column/row along the chosen axis.
    return frame.apply(lambda vec: vec.mode(), axis=axis)

def quantile(self, q=0.5, axis=0, numeric_only=True):
"""
Return values at the given quantile over requested axis, a la
Expand Down
20 changes: 20 additions & 0 deletions pandas/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -1127,6 +1127,26 @@ def value_counts(self, normalize=False, sort=True, ascending=False, bins=None):
return value_counts(self.values, sort=sort, ascending=ascending,
normalize=normalize, bins=bins)

def mode(self):
    """Returns the mode(s) of the dataset.

    Empty if nothing occurs at least 2 times. Always returns Series even
    if only one value.

    This method takes no arguments; the result is sorted whenever the
    values can be ordered.

    Returns
    -------
    modes : Series (sorted)
    """
    # TODO: Add option for bins like value_counts()
    # Imported locally, as in the original, rather than at module level.
    from pandas.core.algorithms import mode
    return mode(self)

def unique(self):
"""
Return array of unique values in the Series. Significantly faster than
Expand Down
102 changes: 90 additions & 12 deletions pandas/hashtable.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -890,15 +890,12 @@ cdef class Int64Factorizer:
return labels



def value_count_int64(ndarray[int64_t] values):
cdef build_count_table_int64(ndarray[int64_t] values, kh_int64_t *table):
cdef:
int k
Py_ssize_t i, n = len(values)
kh_int64_t *table
int ret = 0
list uniques = []

table = kh_init_int64()
kh_resize_int64(table, n)

for i in range(n):
Expand All @@ -910,8 +907,17 @@ def value_count_int64(ndarray[int64_t] values):
k = kh_put_int64(table, val, &ret)
table.vals[k] = 1

# for (k = kh_begin(h); k != kh_end(h); ++k)
# if (kh_exist(h, k)) kh_value(h, k) = 1;

cpdef value_count_int64(ndarray[int64_t] values):
cdef:
Py_ssize_t i
kh_int64_t *table
int ret = 0
int k

table = kh_init_int64()
build_count_table_int64(values, table)

i = 0
result_keys = np.empty(table.n_occupied, dtype=np.int64)
result_counts = np.zeros(table.n_occupied, dtype=np.int64)
Expand All @@ -924,15 +930,15 @@ def value_count_int64(ndarray[int64_t] values):

return result_keys, result_counts

def value_count_object(ndarray[object] values,
ndarray[uint8_t, cast=True] mask):

cdef build_count_table_object(ndarray[object] values,
ndarray[uint8_t, cast=True] mask,
kh_pymap_t *table):
cdef:
int k
Py_ssize_t i, n = len(values)
kh_pymap_t *table
int ret = 0
list uniques = []

table = kh_init_pymap()
kh_resize_pymap(table, n // 10)

for i in range(n):
Expand All @@ -947,6 +953,17 @@ def value_count_object(ndarray[object] values,
k = kh_put_pymap(table, <PyObject*> val, &ret)
table.vals[k] = 1


cpdef value_count_object(ndarray[object] values,
ndarray[uint8_t, cast=True] mask):
cdef:
Py_ssize_t i = len(values)
kh_pymap_t *table
int k

table = kh_init_pymap()
build_count_table_object(values, mask, table)

i = 0
result_keys = np.empty(table.n_occupied, dtype=object)
result_counts = np.zeros(table.n_occupied, dtype=np.int64)
Expand All @@ -959,3 +976,64 @@ def value_count_object(ndarray[object] values,

return result_keys, result_counts


def mode_object(ndarray[object] values, ndarray[uint8_t, cast=True] mask):
    """
    Return the most frequent object(s) in ``values`` as an object ndarray.

    ``mask`` is forwarded unchanged to ``build_count_table_object``
    (presumably it flags entries to leave out of the count -- confirm
    against that helper).

    Only keys counted at least twice are ever reported, so the result is
    empty when no value repeats. Modes come back in hash-bucket order, not
    sorted.
    """
    cdef:
        int count, max_count = 2
        int j = -1 # so you can do +=
        int k
        kh_pymap_t *table

    table = kh_init_pymap()
    # try/finally so the table is released even if counting or the result
    # allocation raises.
    try:
        build_count_table_object(values, mask, table)

        modes = np.empty(table.n_buckets, dtype=np.object_)
        for k in range(table.n_buckets):
            if kh_exist_pymap(table, k):
                count = table.vals[k]

                if count == max_count:
                    # another key tied with the current best count
                    j += 1
                elif count > max_count:
                    # new best count: discard modes collected so far
                    max_count = count
                    j = 0
                else:
                    continue
                modes[j] = <object> table.keys[k]
    finally:
        kh_destroy_pymap(table)

    return modes[:j+1]


def mode_int64(ndarray[int64_t] values):
    """
    Return the most frequent value(s) in ``values`` as an int64 ndarray.

    Only keys counted at least twice are ever reported, so the result is
    empty when no value repeats. Modes come back in hash-bucket order, not
    sorted.
    """
    cdef:
        int val, max_val = 2
        int j = -1 # so you can do +=
        int k
        kh_int64_t *table

    table = kh_init_int64()
    # try/finally so the table is released even if counting or the result
    # allocation raises.
    try:
        build_count_table_int64(values, table)

        modes = np.empty(table.n_buckets, dtype=np.int64)
        for k in range(table.n_buckets):
            if kh_exist_int64(table, k):
                val = table.vals[k]

                if val == max_val:
                    # another key tied with the current best count
                    j += 1
                elif val > max_val:
                    # new best count: discard modes collected so far
                    max_val = val
                    j = 0
                else:
                    continue
                modes[j] = table.keys[k]
    finally:
        kh_destroy_int64(table)

    return modes[:j+1]
53 changes: 53 additions & 0 deletions pandas/tests/test_frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -10050,6 +10050,59 @@ def wrapper(x):
self.assert_(np.isnan(r0).all())
self.assert_(np.isnan(r1).all())

def test_mode(self):
    """Exercise DataFrame.mode: single/multiple/no modes, sorting, mixed dtypes."""
    df = pd.DataFrame({"A": [12, 12, 11, 12, 19, 11],
                       "B": [10, 10, 10, np.nan, 3, 4],
                       "C": [8, 8, 8, 9, 9, 9],
                       "D": range(6),
                       "E": [8, 8, 1, 1, 3, 3]})
    assert_frame_equal(df[["A"]].mode(),
                       pd.DataFrame({"A": [12]}))
    # no repeated value -> empty result that keeps the column dtype
    assert_frame_equal(df[["D"]].mode(),
                       pd.DataFrame(pd.Series([], dtype="int64"),
                                    columns=["D"]))
    assert_frame_equal(df[["E"]].mode(),
                       pd.DataFrame(pd.Series([1, 3, 8], dtype="int64"),
                                    columns=["E"]))
    assert_frame_equal(df[["A", "B"]].mode(),
                       pd.DataFrame({"A": [12], "B": [10.]}))
    # columns with fewer modes are padded with nan
    assert_frame_equal(df.mode(),
                       pd.DataFrame({"A": [12, np.nan, np.nan],
                                     "B": [10, np.nan, np.nan],
                                     "C": [8, 9, np.nan],
                                     "D": [np.nan, np.nan, np.nan],
                                     "E": [1, 3, 8]}))

    # outputs in sorted order
    df["C"] = list(reversed(df["C"]))
    result = df[["A", "B", "C"]].mode()
    expected = pd.DataFrame({"A": [12, np.nan],
                             "B": [10, np.nan],
                             "C": [8, 9]})
    assert_frame_equal(result, expected)

    # should work with heterogeneous types
    df = pd.DataFrame({"A": range(6),
                       "B": pd.date_range('2011', periods=6),
                       "C": list('abcdef')})
    exp = pd.DataFrame({"A": pd.Series([], dtype=df["A"].dtype),
                        "B": pd.Series([], dtype=df["B"].dtype),
                        "C": pd.Series([], dtype=df["C"].dtype)})
    assert_frame_equal(df.mode(), exp)

    # and also when not empty
    df.loc[1, "A"] = 0
    df.loc[4, "B"] = df.loc[3, "B"]
    df.loc[5, "C"] = 'e'
    exp = pd.DataFrame({"A": pd.Series([0], dtype=df["A"].dtype),
                        "B": pd.Series([df.loc[3, "B"]], dtype=df["B"].dtype),
                        "C": pd.Series(['e'], dtype=df["C"].dtype)})

    assert_frame_equal(df.mode(), exp)

def test_sum_corner(self):
axis0 = self.empty.sum(0)
axis1 = self.empty.sum(1)
Expand Down
Loading