Skip to content

Commit 2d2e8b5

Browse files
committed
Merge pull request #5380 from jtratner/add-mode-to-series-and-frame
ENH: Add mode method to Series and DataFrame
2 parents 4d8cd16 + 0d1e383 commit 2d2e8b5

File tree

10 files changed

+271
-16
lines changed

10 files changed

+271
-16
lines changed

doc/source/api.rst

+2
Original file line numberDiff line numberDiff line change
@@ -348,6 +348,7 @@ Computations / Descriptive Stats
348348
Series.mean
349349
Series.median
350350
Series.min
351+
Series.mode
351352
Series.nunique
352353
Series.pct_change
353354
Series.prod
@@ -632,6 +633,7 @@ Computations / Descriptive Stats
632633
DataFrame.mean
633634
DataFrame.median
634635
DataFrame.min
636+
DataFrame.mode
635637
DataFrame.pct_change
636638
DataFrame.prod
637639
DataFrame.quantile

doc/source/basics.rst

+15-2
Original file line numberDiff line numberDiff line change
@@ -378,6 +378,7 @@ optional ``level`` parameter which applies only if the object has a
378378
``median``, Arithmetic median of values
379379
``min``, Minimum
380380
``max``, Maximum
381+
``mode``, Mode
381382
``abs``, Absolute Value
382383
``prod``, Product of values
383384
``std``, Unbiased standard deviation
@@ -473,8 +474,8 @@ value, ``idxmin`` and ``idxmax`` return the first matching index:
473474

474475
.. _basics.discretization:
475476

476-
Value counts (histogramming)
477-
~~~~~~~~~~~~~~~~~~~~~~~~~~~~
477+
Value counts (histogramming) / Mode
478+
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
478479

479480
The ``value_counts`` Series method and top-level function computes a histogram
480481
of a 1D array of values. It can also be used as a function on regular arrays:
@@ -487,6 +488,17 @@ of a 1D array of values. It can also be used as a function on regular arrays:
487488
s.value_counts()
488489
value_counts(data)
489490
491+
Similarly, you can get the most frequently occurring value(s) (the mode) of the values in a Series or DataFrame:
492+
493+
.. ipython:: python
494+
495+
data = [1, 1, 3, 3, 3, 5, 5, 7, 7, 7]
496+
s = Series(data)
497+
s.mode()
498+
df = pd.DataFrame({"A": np.random.randint(0, 7, size=50),
499+
"B": np.random.randint(-10, 15, size=50)})
500+
df.mode()
501+
490502
491503
Discretization and quantiling
492504
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -514,6 +526,7 @@ normally distributed data into equal-size quartiles like so:
514526
value_counts(factor)
515527
516528
We can also pass infinite values to define the bins:
529+
517530
.. ipython:: python
518531
519532
arr = np.random.randn(20)

doc/source/release.rst

+2
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,8 @@ New features
6565
string via the ``date_format`` keyword (:issue:`4313`)
6666
- Added ``LastWeekOfMonth`` DateOffset (:issue:`4637`)
6767
- Added ``FY5253``, and ``FY5253Quarter`` DateOffsets (:issue:`4511`)
68+
- Added ``mode()`` method to ``Series`` and ``DataFrame`` to get the
69+
statistical mode(s) of a column/series. (:issue:`5367`)
6870

6971
Experimental Features
7072
~~~~~~~~~~~~~~~~~~~~~

doc/source/v0.13.0.txt

+2
Original file line numberDiff line numberDiff line change
@@ -81,6 +81,8 @@ API changes
8181
``SparsePanel``, etc.), now support the entire set of arithmetic operators
8282
and arithmetic flex methods (add, sub, mul, etc.). ``SparsePanel`` does not
8383
support ``pow`` or ``mod`` with non-scalars. (:issue:`3765`)
84+
- ``Series`` and ``DataFrame`` now have a ``mode()`` method to calculate the
85+
statistical mode(s) by axis/Series. (:issue:`5367`)
8486

8587

8688
Prior Version Deprecations/Changes

pandas/core/algorithms.py

+36
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
intended for public consumption
44
"""
55

6+
from warnings import warn
67
import numpy as np
78

89
import pandas.core.common as com
@@ -221,6 +222,41 @@ def value_counts(values, sort=True, ascending=False, normalize=False, bins=None)
221222
return result
222223

223224

225+
def mode(values):
    """
    Return the mode(s) of the passed Series or array-like, sorted.

    Parameters
    ----------
    values : Series or array-like
        A Series contributes its ``.values`` and its ``_constructor``;
        anything else is converted with ``np.asanyarray`` and the result is
        wrapped in a plain ``Series``.

    Returns
    -------
    modes : Series (or whatever ``values._constructor`` builds)
        Carries the dtype of the input.  Integer and datetime64/timedelta64
        input is counted through ``htable.mode_int64``; everything else goes
        through ``htable.mode_object`` together with a null mask.
    """
    from pandas.core.series import Series

    if isinstance(values, Series):
        constructor, values = values._constructor, values.values
    else:
        values = np.asanyarray(values)
        constructor = Series

    # remember the caller's dtype before any int64/object coercion below
    dtype = values.dtype

    # the hashtable routines return modes in hash order, which isn't
    # necessarily defined, so every branch sorts before building the result
    if com.is_integer_dtype(dtype):
        found = htable.mode_int64(com._ensure_int64(values))
        return constructor(sorted(found), dtype=dtype)

    if issubclass(dtype.type, (np.datetime64, np.timedelta64)):
        # reinterpret as int64 for counting; dtype= restores the original type
        found = htable.mode_int64(values.view(np.int64))
        return constructor(sorted(found), dtype=dtype)

    null_mask = com.isnull(values)
    found = htable.mode_object(com._ensure_object(values), null_mask)
    try:
        found = sorted(found)
    except TypeError as err:
        # mixed, unorderable objects: keep hash order and tell the caller
        warn("Unable to sort modes: %s" % err)
    return constructor(found, dtype=dtype)
258+
259+
224260
def rank(values, axis=0, method='average', na_option='keep',
225261
ascending=True):
226262
"""

pandas/core/frame.py

+22
Original file line numberDiff line numberDiff line change
@@ -4083,6 +4083,28 @@ def _get_agg_axis(self, axis_num):
40834083
else:
40844084
raise ValueError('Axis must be 0 or 1 (got %r)' % axis_num)
40854085

4086+
def mode(self, axis=0, numeric_only=False):
    """
    Compute the mode(s) along the selected axis by calling ``mode()`` on
    each column (or row) through ``apply``.

    A label with no value occurring 2+ times contributes nothing. One row
    is added for each mode per label, and gaps are filled with nan.

    Parameters
    ----------
    axis : {0, 1, 'index', 'columns'} (default 0)
        0/'index' : get mode of each column
        1/'columns' : get mode of each row
    numeric_only : bool, default False
        if True, only apply to numeric columns

    Returns
    -------
    modes : DataFrame (sorted)
    """
    if numeric_only:
        frame = self._get_numeric_data()
    else:
        frame = self
    return frame.apply(lambda values: values.mode(), axis=axis)
4107+
40864108
def quantile(self, q=0.5, axis=0, numeric_only=True):
40874109
"""
40884110
Return values at the given quantile over requested axis, a la

pandas/core/series.py

+20
Original file line numberDiff line numberDiff line change
@@ -1127,6 +1127,26 @@ def value_counts(self, normalize=False, sort=True, ascending=False, bins=None):
11271127
return value_counts(self.values, sort=sort, ascending=ascending,
11281128
normalize=normalize, bins=bins)
11291129

1130+
def mode(self):
    """Returns the mode(s) of the dataset.

    Empty if nothing occurs at least 2 times. Always returns Series even
    if only one value.

    Takes no arguments; the result is always sorted.

    Returns
    -------
    modes : Series (sorted)
    """
    # TODO: Add option for bins like value_counts()
    from pandas.core.algorithms import mode
    return mode(self)
1149+
11301150
def unique(self):
11311151
"""
11321152
Return array of unique values in the Series. Significantly faster than

pandas/hashtable.pyx

+90-12
Original file line numberDiff line numberDiff line change
@@ -890,15 +890,12 @@ cdef class Int64Factorizer:
890890
return labels
891891

892892

893-
894-
def value_count_int64(ndarray[int64_t] values):
893+
cdef build_count_table_int64(ndarray[int64_t] values, kh_int64_t *table):
895894
cdef:
895+
int k
896896
Py_ssize_t i, n = len(values)
897-
kh_int64_t *table
898897
int ret = 0
899-
list uniques = []
900898

901-
table = kh_init_int64()
902899
kh_resize_int64(table, n)
903900

904901
for i in range(n):
@@ -910,8 +907,17 @@ def value_count_int64(ndarray[int64_t] values):
910907
k = kh_put_int64(table, val, &ret)
911908
table.vals[k] = 1
912909

913-
# for (k = kh_begin(h); k != kh_end(h); ++k)
914-
# if (kh_exist(h, k)) kh_value(h, k) = 1;
910+
911+
cpdef value_count_int64(ndarray[int64_t] values):
912+
cdef:
913+
Py_ssize_t i
914+
kh_int64_t *table
915+
int ret = 0
916+
int k
917+
918+
table = kh_init_int64()
919+
build_count_table_int64(values, table)
920+
915921
i = 0
916922
result_keys = np.empty(table.n_occupied, dtype=np.int64)
917923
result_counts = np.zeros(table.n_occupied, dtype=np.int64)
@@ -924,15 +930,15 @@ def value_count_int64(ndarray[int64_t] values):
924930

925931
return result_keys, result_counts
926932

927-
def value_count_object(ndarray[object] values,
928-
ndarray[uint8_t, cast=True] mask):
933+
934+
cdef build_count_table_object(ndarray[object] values,
935+
ndarray[uint8_t, cast=True] mask,
936+
kh_pymap_t *table):
929937
cdef:
938+
int k
930939
Py_ssize_t i, n = len(values)
931-
kh_pymap_t *table
932940
int ret = 0
933-
list uniques = []
934941

935-
table = kh_init_pymap()
936942
kh_resize_pymap(table, n // 10)
937943

938944
for i in range(n):
@@ -947,6 +953,17 @@ def value_count_object(ndarray[object] values,
947953
k = kh_put_pymap(table, <PyObject*> val, &ret)
948954
table.vals[k] = 1
949955

956+
957+
cpdef value_count_object(ndarray[object] values,
958+
ndarray[uint8_t, cast=True] mask):
959+
cdef:
960+
Py_ssize_t i = len(values)
961+
kh_pymap_t *table
962+
int k
963+
964+
table = kh_init_pymap()
965+
build_count_table_object(values, mask, table)
966+
950967
i = 0
951968
result_keys = np.empty(table.n_occupied, dtype=object)
952969
result_counts = np.zeros(table.n_occupied, dtype=np.int64)
@@ -959,3 +976,64 @@ def value_count_object(ndarray[object] values,
959976

960977
return result_keys, result_counts
961978

979+
980+
def mode_object(ndarray[object] values, ndarray[uint8_t, cast=True] mask):
    """
    Return the most frequent keys of an object array, in hash-table order.

    ``values`` and ``mask`` are handed to ``build_count_table_object`` to
    fill a count table.  Only keys counted at least twice are candidates
    (``max_count`` starts at 2), so the result is empty when nothing
    repeats.  The result is not sorted.
    """
    cdef:
        int count, max_count = 2
        int j = -1  # index of the last mode written; -1 means "none yet"
        int k
        kh_pymap_t *table

    table = kh_init_pymap()
    build_count_table_object(values, mask, table)

    # one slot per occupied bucket is enough: at most every key is a mode
    modes = np.empty(table.n_occupied, dtype=np.object_)
    for k in range(table.n_buckets):
        if kh_exist_pymap(table, k):
            count = table.vals[k]

            if count == max_count:
                # ties with the current best: append
                j += 1
            elif count > max_count:
                # new best count: discard earlier candidates
                max_count = count
                j = 0
            else:
                continue
            modes[j] = <object> table.keys[k]

    kh_destroy_pymap(table)

    return modes[:j+1]
1009+
1010+
1011+
def mode_int64(ndarray[int64_t] values):
    """
    Return the most frequent values of an int64 array, in hash-table order.

    Counts come from ``build_count_table_int64``.  Only values counted at
    least twice are candidates (``max_val`` starts at 2), so the result is
    empty when nothing repeats.  The result is not sorted.
    """
    cdef:
        int val, max_val = 2  # val holds a count, not a key
        int j = -1  # index of the last mode written; -1 means "none yet"
        int k
        kh_int64_t *table

    table = kh_init_int64()

    build_count_table_int64(values, table)

    # one slot per occupied bucket is enough: at most every key is a mode
    modes = np.empty(table.n_occupied, dtype=np.int64)
    for k in range(table.n_buckets):
        if kh_exist_int64(table, k):
            val = table.vals[k]

            if val == max_val:
                # ties with the current best: append
                j += 1
            elif val > max_val:
                # new best count: discard earlier candidates
                max_val = val
                j = 0
            else:
                continue
            modes[j] = table.keys[k]

    kh_destroy_int64(table)

    return modes[:j+1]

pandas/tests/test_frame.py

+53
Original file line numberDiff line numberDiff line change
@@ -10050,6 +10050,59 @@ def wrapper(x):
1005010050
self.assert_(np.isnan(r0).all())
1005110051
self.assert_(np.isnan(r1).all())
1005210052

10053+
def test_mode(self):
    """Check DataFrame.mode for single/multiple/no modes and mixed dtypes."""
    df = pd.DataFrame({"A": [12, 12, 11, 12, 19, 11],
                       "B": [10, 10, 10, np.nan, 3, 4],
                       "C": [8, 8, 8, 9, 9, 9],
                       "D": range(6),
                       "E": [8, 8, 1, 1, 3, 3]})
    assert_frame_equal(df[["A"]].mode(),
                       pd.DataFrame({"A": [12]}))
    # no value repeats -> empty result that keeps the column's dtype
    assert_frame_equal(df[["D"]].mode(),
                       pd.DataFrame(pd.Series([], dtype="int64"),
                                    columns=["D"]))
    assert_frame_equal(df[["E"]].mode(),
                       pd.DataFrame(pd.Series([1, 3, 8], dtype="int64"),
                                    columns=["E"]))
    assert_frame_equal(df[["A", "B"]].mode(),
                       pd.DataFrame({"A": [12], "B": [10.]}))
    # columns with fewer modes are padded with nan
    assert_frame_equal(df.mode(),
                       pd.DataFrame({"A": [12, np.nan, np.nan],
                                     "B": [10, np.nan, np.nan],
                                     "C": [8, 9, np.nan],
                                     "D": [np.nan, np.nan, np.nan],
                                     "E": [1, 3, 8]}))

    # outputs in sorted order
    df["C"] = list(reversed(df["C"]))
    assert_frame_equal(df[["A", "B", "C"]].mode(),
                       pd.DataFrame({"A": [12, np.nan],
                                     "B": [10, np.nan],
                                     "C": [8, 9]}))

    # should work with heterogeneous types
    df = pd.DataFrame({"A": range(6),
                       "B": pd.date_range('2011', periods=6),
                       "C": list('abcdef')})
    exp = pd.DataFrame({"A": pd.Series([], dtype=df["A"].dtype),
                        "B": pd.Series([], dtype=df["B"].dtype),
                        "C": pd.Series([], dtype=df["C"].dtype)})
    assert_frame_equal(df.mode(), exp)

    # and also when not empty
    df.loc[1, "A"] = 0
    df.loc[4, "B"] = df.loc[3, "B"]
    df.loc[5, "C"] = 'e'
    exp = pd.DataFrame({"A": pd.Series([0], dtype=df["A"].dtype),
                        "B": pd.Series([df.loc[3, "B"]], dtype=df["B"].dtype),
                        "C": pd.Series(['e'], dtype=df["C"].dtype)})

    assert_frame_equal(df.mode(), exp)
10105+
1005310106
def test_sum_corner(self):
1005410107
axis0 = self.empty.sum(0)
1005510108
axis1 = self.empty.sum(1)

0 commit comments

Comments
 (0)