Skip to content

Commit 7841d9a

Browse files
committed
ENH: Add mode method to Series and DataFrame
Abstract building up the tables with counts so value_count and mode can share. Aww! DOC: Add documentation about mode()
1 parent 4d8cd16 commit 7841d9a

File tree

10 files changed

+243
-16
lines changed

10 files changed

+243
-16
lines changed

doc/source/api.rst

+2
Original file line numberDiff line numberDiff line change
@@ -348,6 +348,7 @@ Computations / Descriptive Stats
348348
Series.mean
349349
Series.median
350350
Series.min
351+
Series.mode
351352
Series.nunique
352353
Series.pct_change
353354
Series.prod
@@ -632,6 +633,7 @@ Computations / Descriptive Stats
632633
DataFrame.mean
633634
DataFrame.median
634635
DataFrame.min
636+
DataFrame.mode
635637
DataFrame.pct_change
636638
DataFrame.prod
637639
DataFrame.quantile

doc/source/basics.rst

+15-2
Original file line numberDiff line numberDiff line change
@@ -378,6 +378,7 @@ optional ``level`` parameter which applies only if the object has a
378378
``median``, Arithmetic median of values
379379
``min``, Minimum
380380
``max``, Maximum
381+
``mode``, Mode
381382
``abs``, Absolute Value
382383
``prod``, Product of values
383384
``std``, Unbiased standard deviation
@@ -473,8 +474,8 @@ value, ``idxmin`` and ``idxmax`` return the first matching index:
473474

474475
.. _basics.discretization:
475476

476-
Value counts (histogramming)
477-
~~~~~~~~~~~~~~~~~~~~~~~~~~~~
477+
Value counts (histogramming) / Mode
478+
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
478479

479480
The ``value_counts`` Series method and top-level function computes a histogram
480481
of a 1D array of values. It can also be used as a function on regular arrays:
@@ -487,6 +488,17 @@ of a 1D array of values. It can also be used as a function on regular arrays:
487488
s.value_counts()
488489
value_counts(data)
489490
491+
Similarly, you can get the most frequently occurring value(s) (the mode) of the values in a Series or DataFrame:
492+
493+
.. ipython:: python
494+
495+
data = [1, 1, 3, 3, 3, 5, 5, 7, 7, 7]
496+
s = Series(data)
497+
s.mode()
498+
df = pd.DataFrame({"A": np.random.randint(0, 7, size=50),
499+
"B": np.random.randint(-10, 15, size=50)})
500+
df.mode()
501+
490502
491503
Discretization and quantiling
492504
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -514,6 +526,7 @@ normally distributed data into equal-size quartiles like so:
514526
value_counts(factor)
515527
516528
We can also pass infinite values to define the bins:
529+
517530
.. ipython:: python
518531
519532
arr = np.random.randn(20)

doc/source/release.rst

+2
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,8 @@ New features
6565
string via the ``date_format`` keyword (:issue:`4313`)
6666
- Added ``LastWeekOfMonth`` DateOffset (:issue:`4637`)
6767
- Added ``FY5253``, and ``FY5253Quarter`` DateOffsets (:issue:`4511`)
68+
- Added ``mode()`` method to ``Series`` and ``DataFrame`` to get the
69+
statistical mode(s) of a column/series. (:issue:`5367`)
6870

6971
Experimental Features
7072
~~~~~~~~~~~~~~~~~~~~~

doc/source/v0.13.0.txt

+2
Original file line numberDiff line numberDiff line change
@@ -81,6 +81,8 @@ API changes
8181
``SparsePanel``, etc.), now support the entire set of arithmetic operators
8282
and arithmetic flex methods (add, sub, mul, etc.). ``SparsePanel`` does not
8383
support ``pow`` or ``mod`` with non-scalars. (:issue:`3765`)
84+
- ``Series`` and ``DataFrame`` now have a ``mode()`` method to calculate the
85+
statistical mode(s) by axis/Series. (:issue:`5367`)
8486

8587

8688
Prior Version Deprecations/Changes

pandas/core/algorithms.py

+28
Original file line numberDiff line numberDiff line change
@@ -221,6 +221,34 @@ def value_counts(values, sort=True, ascending=False, normalize=False, bins=None)
221221
return result
222222

223223

224+
def mode(values):
225+
from pandas.core.series import Series
226+
227+
if isinstance(values, Series):
228+
constructor = values._constructor
229+
values = values.values
230+
else:
231+
values = np.asanyarray(values)
232+
constructor = Series
233+
234+
dtype = values.dtype
235+
if com.is_integer_dtype(values.dtype):
236+
values = com._ensure_int64(values)
237+
result = constructor(htable.mode_int64(values), dtype=dtype)
238+
239+
elif issubclass(values.dtype.type, (np.datetime64,np.timedelta64)):
240+
dtype = values.dtype
241+
values = values.view(np.int64)
242+
result = constructor(htable.mode_int64(values), dtype=dtype)
243+
244+
else:
245+
mask = com.isnull(values)
246+
values = com._ensure_object(values)
247+
result = constructor(htable.mode_object(values, mask), dtype=dtype)
248+
249+
return result
250+
251+
224252
def rank(values, axis=0, method='average', na_option='keep',
225253
ascending=True):
226254
"""

pandas/core/frame.py

+18
Original file line numberDiff line numberDiff line change
@@ -4083,6 +4083,24 @@ def _get_agg_axis(self, axis_num):
40834083
else:
40844084
raise ValueError('Axis must be 0 or 1 (got %r)' % axis_num)
40854085

4086+
def mode(self, axis=0, numeric_only=False):
4087+
"""
4088+
Gets the mode of each element along the axis selected. Empty if nothing
4089+
has 2+ occurrences. Adds a row for each mode per label, fills in gaps
4090+
with nan.
4091+
4092+
Parameters
4093+
----------
4094+
axis : {0, 1, 'index', 'columns'} (default 0)
4095+
0/'index' : get mode of each column
4096+
1/'columns' : get mode of each row
4097+
numeric_only : bool, default False
4098+
if True, only apply to numeric columns
4099+
"""
4100+
data = self if not numeric_only else self._get_numeric_data()
4101+
f = lambda s: s.mode()
4102+
return data.apply(f, axis=axis)
4103+
40864104
def quantile(self, q=0.5, axis=0, numeric_only=True):
40874105
"""
40884106
Return values at the given quantile over requested axis, a la

pandas/core/series.py

+14
Original file line numberDiff line numberDiff line change
@@ -1127,6 +1127,20 @@ def value_counts(self, normalize=False, sort=True, ascending=False, bins=None):
11271127
return value_counts(self.values, sort=sort, ascending=ascending,
11281128
normalize=normalize, bins=bins)
11291129

1130+
def mode(self):
1131+
"""Returns the mode(s) of the dataset.
1132+
1133+
Empty if nothing occurs at least 2 times. Always returns Series even
1134+
if only one value.
1135+
1136+
Returns
1137+
-------
1138+
modes : Series
1139+
"""
1140+
# TODO: Add option for bins like value_counts()
1141+
from pandas.core.algorithms import mode
1142+
return mode(self)
1143+
11301144
def unique(self):
11311145
"""
11321146
Return array of unique values in the Series. Significantly faster than

pandas/hashtable.pyx

+90-12
Original file line numberDiff line numberDiff line change
@@ -890,15 +890,12 @@ cdef class Int64Factorizer:
890890
return labels
891891

892892

893-
894-
def value_count_int64(ndarray[int64_t] values):
893+
cdef build_count_table_int64(ndarray[int64_t] values, kh_int64_t *table):
895894
cdef:
895+
int k
896896
Py_ssize_t i, n = len(values)
897-
kh_int64_t *table
898897
int ret = 0
899-
list uniques = []
900898

901-
table = kh_init_int64()
902899
kh_resize_int64(table, n)
903900

904901
for i in range(n):
@@ -910,8 +907,17 @@ def value_count_int64(ndarray[int64_t] values):
910907
k = kh_put_int64(table, val, &ret)
911908
table.vals[k] = 1
912909

913-
# for (k = kh_begin(h); k != kh_end(h); ++k)
914-
# if (kh_exist(h, k)) kh_value(h, k) = 1;
910+
911+
cpdef value_count_int64(ndarray[int64_t] values):
912+
cdef:
913+
Py_ssize_t i
914+
kh_int64_t *table
915+
int ret = 0
916+
int k
917+
918+
table = kh_init_int64()
919+
build_count_table_int64(values, table)
920+
915921
i = 0
916922
result_keys = np.empty(table.n_occupied, dtype=np.int64)
917923
result_counts = np.zeros(table.n_occupied, dtype=np.int64)
@@ -924,15 +930,15 @@ def value_count_int64(ndarray[int64_t] values):
924930

925931
return result_keys, result_counts
926932

927-
def value_count_object(ndarray[object] values,
928-
ndarray[uint8_t, cast=True] mask):
933+
934+
cdef build_count_table_object(ndarray[object] values,
935+
ndarray[uint8_t, cast=True] mask,
936+
kh_pymap_t *table):
929937
cdef:
938+
int k
930939
Py_ssize_t i, n = len(values)
931-
kh_pymap_t *table
932940
int ret = 0
933-
list uniques = []
934941

935-
table = kh_init_pymap()
936942
kh_resize_pymap(table, n // 10)
937943

938944
for i in range(n):
@@ -947,6 +953,17 @@ def value_count_object(ndarray[object] values,
947953
k = kh_put_pymap(table, <PyObject*> val, &ret)
948954
table.vals[k] = 1
949955

956+
957+
cpdef value_count_object(ndarray[object] values,
958+
ndarray[uint8_t, cast=True] mask):
959+
cdef:
960+
Py_ssize_t i = len(values)
961+
kh_pymap_t *table
962+
int k
963+
964+
table = kh_init_pymap()
965+
build_count_table_object(values, mask, table)
966+
950967
i = 0
951968
result_keys = np.empty(table.n_occupied, dtype=object)
952969
result_counts = np.zeros(table.n_occupied, dtype=np.int64)
@@ -959,3 +976,64 @@ def value_count_object(ndarray[object] values,
959976

960977
return result_keys, result_counts
961978

979+
980+
def mode_object(ndarray[object] values, ndarray[uint8_t, cast=True] mask):
981+
cdef:
982+
int count, max_count = 2
983+
int j = -1 # so you can do +=
984+
int k
985+
Py_ssize_t i, n = len(values)
986+
kh_pymap_t *table
987+
int ret = 0
988+
989+
table = kh_init_pymap()
990+
build_count_table_object(values, mask, table)
991+
992+
modes = np.empty(table.n_buckets, dtype=np.object_)
993+
for k in range(table.n_buckets):
994+
if kh_exist_pymap(table, k):
995+
count = table.vals[k]
996+
997+
if count == max_count:
998+
j += 1
999+
elif count > max_count:
1000+
max_count = count
1001+
j = 0
1002+
else:
1003+
continue
1004+
modes[j] = <object> table.keys[k]
1005+
1006+
kh_destroy_pymap(table)
1007+
1008+
return modes[:j+1]
1009+
1010+
1011+
def mode_int64(ndarray[int64_t] values):
1012+
cdef:
1013+
int val, max_val = 2
1014+
int j = -1 # so you can do +=
1015+
int k
1016+
kh_int64_t *table
1017+
list uniques = []
1018+
1019+
table = kh_init_int64()
1020+
1021+
build_count_table_int64(values, table)
1022+
1023+
modes = np.empty(table.n_buckets, dtype=np.int64)
1024+
for k in range(table.n_buckets):
1025+
if kh_exist_int64(table, k):
1026+
val = table.vals[k]
1027+
1028+
if val == max_val:
1029+
j += 1
1030+
elif val > max_val:
1031+
max_val = val
1032+
j = 0
1033+
else:
1034+
continue
1035+
modes[j] = table.keys[k]
1036+
1037+
kh_destroy_int64(table)
1038+
1039+
return modes[:j+1]

pandas/tests/test_frame.py

+43
Original file line numberDiff line numberDiff line change
@@ -10050,6 +10050,49 @@ def wrapper(x):
1005010050
self.assert_(np.isnan(r0).all())
1005110051
self.assert_(np.isnan(r1).all())
1005210052

10053+
def test_mode(self):
10054+
df = pd.DataFrame({"A": [12, 12, 11, 12, 19, 11],
10055+
"B": [10, 10, 10, 5, 3, 4],
10056+
"C": [8, 8, 8, 9, 9, 9],
10057+
"D": range(6)})
10058+
assert_frame_equal(df[["A"]].mode(),
10059+
pd.DataFrame({"A": [12]}))
10060+
assert_frame_equal(df[["D"]].mode(),
10061+
pd.DataFrame(pd.Series([], dtype="int64"),
10062+
columns=["D"]))
10063+
assert_frame_equal(df[["A", "B"]].mode(),
10064+
pd.DataFrame({"A": [12], "B": [10]}))
10065+
assert_frame_equal(df.mode(),
10066+
pd.DataFrame({"A": [12, np.nan],
10067+
"B": [10, np.nan],
10068+
"C": [8, 9],
10069+
"D": [np.nan, np.nan]}))
10070+
10071+
# should preserve order
10072+
df["C"] = list(reversed(df["C"]))
10073+
assert_frame_equal(df[["A", "B", "C"]].mode(),
10074+
pd.DataFrame({"A": [12, np.nan],
10075+
"B": [10, np.nan],
10076+
"C": [9, 8]}))
10077+
# should work with heterogeneous types
10078+
df = pd.DataFrame({"A": range(6),
10079+
"B": pd.date_range('2011', periods=6),
10080+
"C": list('abcdef')})
10081+
exp = pd.DataFrame({"A": pd.Series([], dtype=df["A"].dtype),
10082+
"B": pd.Series([], dtype=df["B"].dtype),
10083+
"C": pd.Series([], dtype=df["C"].dtype)})
10084+
assert_frame_equal(df.mode(), exp)
10085+
10086+
# and also when not empty
10087+
df.loc[1, "A"] = 0
10088+
df.loc[4, "B"] = df.loc[3, "B"]
10089+
df.loc[5, "C"] = 'e'
10090+
exp = pd.DataFrame({"A": pd.Series([0], dtype=df["A"].dtype),
10091+
"B": pd.Series([df.loc[3, "B"]], dtype=df["B"].dtype),
10092+
"C": pd.Series(['e'], dtype=df["C"].dtype)})
10093+
10094+
assert_frame_equal(df.mode(), exp)
10095+
1005310096
def test_sum_corner(self):
1005410097
axis0 = self.empty.sum(0)
1005510098
axis1 = self.empty.sum(1)

pandas/tests/test_series.py

+29-2
Original file line numberDiff line numberDiff line change
@@ -19,10 +19,8 @@
1919
from pandas.core.index import MultiIndex
2020
from pandas.tseries.index import Timestamp, DatetimeIndex
2121
import pandas.core.config as cf
22-
import pandas.core.series as smod
2322
import pandas.lib as lib
2423

25-
import pandas.core.common as com
2624
import pandas.core.datetools as datetools
2725
import pandas.core.nanops as nanops
2826

@@ -1721,6 +1719,35 @@ def test_median(self):
17211719
int_ts = Series(np.ones(10, dtype=int), index=lrange(10))
17221720
self.assertAlmostEqual(np.median(int_ts), int_ts.median())
17231721

1722+
def test_mode(self):
1723+
s = Series([12, 12, 11, 10, 19, 11])
1724+
exp = Series([11, 12])
1725+
assert_series_equal(s.mode(), exp)
1726+
1727+
assert_series_equal(Series([1, 2, 3]).mode(), Series([], dtype=int))
1728+
1729+
lst = [5] * 20 + [1] * 10 + [6] * 25
1730+
np.random.shuffle(lst)
1731+
s = Series(lst)
1732+
assert_series_equal(s.mode(), Series([6]))
1733+
1734+
s = Series([5] * 10)
1735+
assert_series_equal(s.mode(), Series([5]))
1736+
1737+
s = Series(lst)
1738+
s[0] = np.nan
1739+
assert_series_equal(s.mode(), Series([6], dtype=float))
1740+
1741+
s = Series(list('adfasbasfwewefwefweeeeasdfasnbam'))
1742+
assert_series_equal(s.mode(), Series(['e']))
1743+
1744+
s = Series(['2011-01-03', '2013-01-02', '1900-05-03'], dtype='M8[ns]')
1745+
assert_series_equal(s.mode(), Series([], dtype="M8[ns]"))
1746+
s = Series(['2011-01-03', '2013-01-02', '1900-05-03', '2011-01-03',
1747+
'2013-01-02'], dtype='M8[ns]')
1748+
assert_series_equal(s.mode(), Series(['2011-01-03', '2013-01-02'],
1749+
dtype='M8[ns]'))
1750+
17241751
def test_prod(self):
17251752
self._check_stat_op('prod', np.prod)
17261753

0 commit comments

Comments
 (0)