Skip to content

Commit 036a4c1

Browse files
committed
ENH: mode(dropna=False) (pandas-dev#17534)
1 parent 3a2e9e6 commit 036a4c1

File tree

7 files changed

+89
-25
lines changed

7 files changed

+89
-25
lines changed

pandas/_libs/hashtable_func_helper.pxi.in

+6-7
Original file line numberDiff line numberDiff line change
@@ -288,25 +288,25 @@ def ismember_{{dtype}}({{scalar}}[:] arr, {{scalar}}[:] values, bint hasnans=0):
288288
{{py:
289289

290290
# dtype, ctype, table_type, npy_dtype
291-
dtypes = [('int64', 'int64_t', 'int64', 'int64'),
291+
dtypes = [('float64', 'float64_t', 'float64', 'float64'),
292+
('int64', 'int64_t', 'int64', 'int64'),
292293
('uint64', 'uint64_t', 'uint64', 'uint64'),
293294
('object', 'object', 'pymap', 'object_')]
294295
}}
295296

296297
{{for dtype, ctype, table_type, npy_dtype in dtypes}}
297298

298-
299299
@cython.wraparound(False)
300300
@cython.boundscheck(False)
301301

302302
{{if dtype == 'object'}}
303303

304304

305-
def mode_{{dtype}}(ndarray[{{ctype}}] values):
305+
def mode_{{dtype}}(ndarray[{{ctype}}] values, bint dropna):
306306
{{else}}
307307

308308

309-
def mode_{{dtype}}({{ctype}}[:] values):
309+
def mode_{{dtype}}({{ctype}}[:] values, bint dropna):
310310
{{endif}}
311311
cdef:
312312
int count, max_count = 1
@@ -317,9 +317,9 @@ def mode_{{dtype}}({{ctype}}[:] values):
317317

318318
table = kh_init_{{table_type}}()
319319
{{if dtype == 'object'}}
320-
build_count_table_{{dtype}}(values, table, 1)
320+
build_count_table_{{dtype}}(values, table, dropna)
321321
{{else}}
322-
build_count_table_{{dtype}}(values, table, 0)
322+
build_count_table_{{dtype}}(values, table, dropna)
323323
{{endif}}
324324

325325
modes = np.empty(table.n_buckets, dtype=np.{{npy_dtype}})
@@ -329,7 +329,6 @@ def mode_{{dtype}}({{ctype}}[:] values):
329329
for k in range(table.n_buckets):
330330
if kh_exist_{{table_type}}(table, k):
331331
count = table.vals[k]
332-
333332
if count == max_count:
334333
j += 1
335334
elif count > max_count:

pandas/core/algorithms.py

+6-10
Original file line numberDiff line numberDiff line change
@@ -791,14 +791,16 @@ def duplicated(values, keep='first'):
791791
return f(values, keep=keep)
792792

793793

794-
def mode(values):
794+
def mode(values, dropna=True):
795795
"""
796796
Returns the mode(s) of an array.
797797
798798
Parameters
799799
----------
800800
values : array-like
801801
Array over which to compute the mode(s).
802+
dropna : boolean, default True
803+
Don't include counts of NaN.
802804
803805
Returns
804806
-------
@@ -811,20 +813,14 @@ def mode(values):
811813

812814
# categorical is a fast-path
813815
if is_categorical_dtype(values):
814-
815816
if isinstance(values, Series):
816-
return Series(values.values.mode(), name=values.name)
817-
return values.mode()
817+
return Series(values.values.mode(dropna=dropna), name=values.name)
818+
return values.mode(dropna=dropna)
818819

819820
values, dtype, ndtype = _ensure_data(values)
820821

821-
# TODO: this should support float64
822-
if ndtype not in ['int64', 'uint64', 'object']:
823-
ndtype = 'object'
824-
values = _ensure_object(values)
825-
826822
f = getattr(htable, "mode_{dtype}".format(dtype=ndtype))
827-
result = f(values)
823+
result = f(values, dropna=dropna)
828824
try:
829825
result = np.sort(result)
830826
except TypeError as e:

pandas/core/arrays/categorical.py

+11-3
Original file line numberDiff line numberDiff line change
@@ -2044,20 +2044,28 @@ def max(self, numeric_only=None, **kwargs):
20442044
else:
20452045
return self.categories[pointer]
20462046

2047-
def mode(self):
2047+
def mode(self, dropna=True):
20482048
"""
20492049
Returns the mode(s) of the Categorical.
20502050
20512051
Always returns `Categorical` even if only one value.
20522052
2053+
Parameters
2054+
----------
2055+
dropna : boolean, default True
2056+
Don't include counts of NaN.
2057+
20532058
Returns
20542059
-------
20552060
modes : `Categorical` (sorted)
20562061
"""
20572062

20582063
import pandas._libs.hashtable as htable
2059-
good = self._codes != -1
2060-
values = sorted(htable.mode_int64(_ensure_int64(self._codes[good])))
2064+
vals = self._codes
2065+
if dropna:
2066+
good = self._codes != -1
2067+
vals = self._codes[good]
2068+
values = sorted(htable.mode_int64(_ensure_int64(vals), dropna))
20612069
result = self._constructor(values=values, categories=self.categories,
20622070
ordered=self.ordered, fastpath=True)
20632071
return result

pandas/core/frame.py

+4-2
Original file line numberDiff line numberDiff line change
@@ -6992,7 +6992,7 @@ def _get_agg_axis(self, axis_num):
69926992
else:
69936993
raise ValueError('Axis must be 0 or 1 (got %r)' % axis_num)
69946994

6995-
def mode(self, axis=0, numeric_only=False):
6995+
def mode(self, axis=0, numeric_only=False, dropna=True):
69966996
"""
69976997
Gets the mode(s) of each element along the axis selected. Adds a row
69986998
for each mode per label, fills in gaps with nan.
@@ -7010,6 +7010,8 @@ def mode(self, axis=0, numeric_only=False):
70107010
* 1 or 'columns' : get mode of each row
70117011
numeric_only : boolean, default False
70127012
if True, only apply to numeric columns
7013+
dropna : boolean, default True
7014+
Don't include counts of NaN.
70137015
70147016
Returns
70157017
-------
@@ -7026,7 +7028,7 @@ def mode(self, axis=0, numeric_only=False):
70267028
data = self if not numeric_only else self._get_numeric_data()
70277029

70287030
def f(s):
7029-
return s.mode()
7031+
return s.mode(dropna=dropna)
70307032

70317033
return data.apply(f, axis=axis)
70327034

pandas/core/series.py

+9-2
Original file line numberDiff line numberDiff line change
@@ -1420,17 +1420,24 @@ def count(self, level=None):
14201420
return self._constructor(out, index=lev,
14211421
dtype='int64').__finalize__(self)
14221422

1423-
def mode(self):
1423+
def mode(self, dropna=True):
14241424
"""Return the mode(s) of the dataset.
14251425
14261426
Always returns Series even if only one value is returned.
14271427
1428+
Parameters
1429+
----------
1430+
dropna : boolean, default True
1431+
Don't include counts of NaN.
1432+
14281433
Returns
14291434
-------
14301435
modes : Series (sorted)
1436+
1437+
14311438
"""
14321439
# TODO: Add option for bins like value_counts()
1433-
return algorithms.mode(self)
1440+
return algorithms.mode(self, dropna=dropna)
14341441

14351442
def unique(self):
14361443
"""

pandas/tests/frame/test_analytics.py

+31
Original file line numberDiff line numberDiff line change
@@ -889,6 +889,37 @@ def test_mode(self):
889889
dtype=df["C"].dtype)})
890890
tm.assert_frame_equal(df.mode(), exp)
891891

892+
def test_mode_dropna(self):
893+
# GH 17534
894+
# Test the dropna=False parameter for mode
895+
896+
df = pd.DataFrame({"A": [1, np.nan, np.nan, np.nan],
897+
"B": [np.nan, np.nan, 'a', np.nan],
898+
"C": Categorical([np.nan, np.nan, 'a', np.nan]),
899+
"D": pd.to_datetime(['NaT', '2000-1-2', 'NaT', 'NaT']),
900+
"E": pd.to_timedelta(['1 days', 'nan', 'nan', 'nan']),
901+
"F": [1, 1, np.nan, np.nan],
902+
"G": [np.nan, np.nan, 'a', 'a'],
903+
"H": Categorical(['a', np.nan, 'a', np.nan]),
904+
"I": pd.to_datetime(['2000-1-2', '2000-1-2', 'NaT', 'NaT']),
905+
"J": pd.to_timedelta(['1 days', 'nan', '1 days', 'nan'])})
906+
907+
result = df.loc[:, 'A':'E'].mode(dropna=False)
908+
expected = pd.DataFrame({'A': [np.nan],
909+
'B': np.array([np.nan], dtype=object),
910+
'C': Categorical([np.nan], categories=['a']),
911+
'D': [pd.NaT],
912+
'E': pd.to_timedelta([pd.NaT])})
913+
tm.assert_frame_equal(result, expected)
914+
915+
result = df.loc[:, 'F':'J'].mode(dropna=False)
916+
expected = pd.DataFrame({'F': [1, np.nan],
917+
'G': [np.nan, 'a'],
918+
'H': Categorical([np.nan, 'a'], categories=['a']),
919+
'I': pd.to_datetime(['NaT', '2000-1-2']),
920+
'J': pd.to_timedelta(['nan', '1 days'])})
921+
tm.assert_frame_equal(result, expected)
922+
892923
def test_operators_timedelta64(self):
893924
from datetime import timedelta
894925
df = DataFrame(dict(A=date_range('2012-1-1', periods=3, freq='D'),

pandas/tests/series/test_analytics.py

+22-1
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212

1313
from pandas import (Series, Categorical, DataFrame, isna, notna,
1414
bdate_range, date_range, _np_version_under1p10,
15-
CategoricalIndex)
15+
CategoricalIndex, to_datetime, to_timedelta)
1616
from pandas.core.index import MultiIndex
1717
from pandas.core.indexes.datetimes import Timestamp
1818
from pandas.core.indexes.timedeltas import Timedelta
@@ -321,6 +321,27 @@ def test_mode(self):
321321
exp = Series(exp, dtype='category')
322322
tm.assert_series_equal(Series(c).mode(), exp)
323323

324+
@pytest.mark.parametrize('values, expected', [
325+
([np.nan, np.nan, 1], [np.nan]),
326+
([np.nan, 1], [1, np.nan]),
327+
([np.nan, np.nan, 'a'], np.array([np.nan], dtype=object)),
328+
([np.nan, 'a'], [np.nan, 'a']),
329+
(Categorical([np.nan, np.nan, 'a']),
330+
Categorical([np.nan], categories=['a'])),
331+
(Categorical([np.nan, np.nan, 1]),
332+
Categorical([np.nan], categories=[1])),
333+
(to_datetime(['NaT', '2000-1-2', 'NaT']), [pd.NaT]),
334+
(to_datetime(['NaT', '2000-1-2']), to_datetime(['NaT', '2000-1-2'])),
335+
(to_timedelta(['1 days', 'nan', 'nan']), to_timedelta(['NaT']))
336+
])
337+
def test_mode_dropna(self, values, expected):
338+
# GH 17534
339+
# Test the dropna=False parameter for mode
340+
341+
result = Series(values).mode(dropna=False)
342+
expected = Series(expected)
343+
tm.assert_series_equal(result, expected)
344+
324345
def test_prod(self):
325346
self._check_stat_op('prod', np.prod)
326347

0 commit comments

Comments
 (0)