Skip to content

Commit 0e3c2f6

Browse files
committed
ENH: Implement mode(dropna=False)
1 parent b02c69a commit 0e3c2f6

File tree

8 files changed

+102
-27
lines changed

8 files changed

+102
-27
lines changed

doc/source/whatsnew/v0.23.0.txt

+1
Original file line number | Diff line number | Diff line change
@@ -523,6 +523,7 @@ Other Enhancements
523523
library. (:issue:`20564`)
524524
- Added new writer for exporting Stata dta files in version 117, ``StataWriter117``. This format supports exporting strings with lengths up to 2,000,000 characters (:issue:`16450`)
525525
- :func:`to_hdf` and :func:`read_hdf` now accept an ``errors`` keyword argument to control encoding error handling (:issue:`20835`)
526+
- :func:`Series.mode` and :func:`DataFrame.mode` now support the ``dropna`` parameter, which specifies whether ``NaN``/``NaT`` values are counted when computing the mode (:issue:`17534`)
526527

527528
.. _whatsnew_0230.api_breaking:
528529

pandas/_libs/hashtable_func_helper.pxi.in

+6-6
Original file line number | Diff line number | Diff line change
@@ -288,7 +288,8 @@ def ismember_{{dtype}}({{scalar}}[:] arr, {{scalar}}[:] values, bint hasnans=0):
288288
{{py:
289289

290290
# dtype, ctype, table_type, npy_dtype
291-
dtypes = [('int64', 'int64_t', 'int64', 'int64'),
291+
dtypes = [('float64', 'float64_t', 'float64', 'float64'),
292+
('int64', 'int64_t', 'int64', 'int64'),
292293
('uint64', 'uint64_t', 'uint64', 'uint64'),
293294
('object', 'object', 'pymap', 'object_')]
294295
}}
@@ -302,11 +303,11 @@ dtypes = [('int64', 'int64_t', 'int64', 'int64'),
302303
{{if dtype == 'object'}}
303304

304305

305-
def mode_{{dtype}}(ndarray[{{ctype}}] values):
306+
def mode_{{dtype}}(ndarray[{{ctype}}] values, bint dropna):
306307
{{else}}
307308

308309

309-
def mode_{{dtype}}({{ctype}}[:] values):
310+
def mode_{{dtype}}({{ctype}}[:] values, bint dropna):
310311
{{endif}}
311312
cdef:
312313
int count, max_count = 1
@@ -317,9 +318,9 @@ def mode_{{dtype}}({{ctype}}[:] values):
317318

318319
table = kh_init_{{table_type}}()
319320
{{if dtype == 'object'}}
320-
build_count_table_{{dtype}}(values, table, 1)
321+
build_count_table_{{dtype}}(values, table, dropna)
321322
{{else}}
322-
build_count_table_{{dtype}}(values, table, 0)
323+
build_count_table_{{dtype}}(values, table, dropna)
323324
{{endif}}
324325

325326
modes = np.empty(table.n_buckets, dtype=np.{{npy_dtype}})
@@ -329,7 +330,6 @@ def mode_{{dtype}}({{ctype}}[:] values):
329330
for k in range(table.n_buckets):
330331
if kh_exist_{{table_type}}(table, k):
331332
count = table.vals[k]
332-
333333
if count == max_count:
334334
j += 1
335335
elif count > max_count:

pandas/core/algorithms.py

+12-12
Original file line number | Diff line number | Diff line change
@@ -25,8 +25,8 @@
2525
is_bool_dtype, needs_i8_conversion,
2626
is_datetimetz,
2727
is_datetime64_any_dtype, is_datetime64tz_dtype,
28-
is_timedelta64_dtype, is_interval_dtype,
29-
is_scalar, is_list_like,
28+
is_timedelta64_dtype, is_datetimelike,
29+
is_interval_dtype, is_scalar, is_list_like,
3030
_ensure_platform_int, _ensure_object,
3131
_ensure_float64, _ensure_uint64,
3232
_ensure_int64)
@@ -798,14 +798,16 @@ def duplicated(values, keep='first'):
798798
return f(values, keep=keep)
799799

800800

801-
def mode(values):
801+
def mode(values, dropna=True):
802802
"""
803803
Returns the mode(s) of an array.
804804
805805
Parameters
806806
----------
807807
values : array-like
808808
Array over which to compute the mode(s).
809+
dropna : boolean, default True
810+
Don't consider counts of NaN/NaT.
809811
810812
Returns
811813
-------
@@ -818,20 +820,18 @@ def mode(values):
818820

819821
# categorical is a fast-path
820822
if is_categorical_dtype(values):
821-
822823
if isinstance(values, Series):
823-
return Series(values.values.mode(), name=values.name)
824-
return values.mode()
824+
return Series(values.values.mode(dropna=dropna), name=values.name)
825+
return values.mode(dropna=dropna)
825826

826-
values, dtype, ndtype = _ensure_data(values)
827+
if dropna and is_datetimelike(values):
828+
mask = values.isnull()
829+
values = values[~mask]
827830

828-
# TODO: this should support float64
829-
if ndtype not in ['int64', 'uint64', 'object']:
830-
ndtype = 'object'
831-
values = _ensure_object(values)
831+
values, dtype, ndtype = _ensure_data(values)
832832

833833
f = getattr(htable, "mode_{dtype}".format(dtype=ndtype))
834-
result = f(values)
834+
result = f(values, dropna=dropna)
835835
try:
836836
result = np.sort(result)
837837
except TypeError as e:

pandas/core/arrays/categorical.py

+11-3
Original file line number | Diff line number | Diff line change
@@ -2117,20 +2117,28 @@ def max(self, numeric_only=None, **kwargs):
21172117
else:
21182118
return self.categories[pointer]
21192119

2120-
def mode(self):
2120+
def mode(self, dropna=True):
21212121
"""
21222122
Returns the mode(s) of the Categorical.
21232123
21242124
Always returns `Categorical` even if only one value.
21252125
2126+
Parameters
2127+
----------
2128+
dropna : boolean, default True
2129+
Don't consider counts of NaN/NaT.
2130+
21262131
Returns
21272132
-------
21282133
modes : `Categorical` (sorted)
21292134
"""
21302135

21312136
import pandas._libs.hashtable as htable
2132-
good = self._codes != -1
2133-
values = sorted(htable.mode_int64(_ensure_int64(self._codes[good])))
2137+
values = self._codes
2138+
if dropna:
2139+
good = self._codes != -1
2140+
values = self._codes[good]
2141+
values = sorted(htable.mode_int64(_ensure_int64(values), dropna))
21342142
result = self._constructor(values=values, categories=self.categories,
21352143
ordered=self.ordered, fastpath=True)
21362144
return result

pandas/core/frame.py

+4-2
Original file line number | Diff line number | Diff line change
@@ -7038,7 +7038,7 @@ def _get_agg_axis(self, axis_num):
70387038
else:
70397039
raise ValueError('Axis must be 0 or 1 (got %r)' % axis_num)
70407040

7041-
def mode(self, axis=0, numeric_only=False):
7041+
def mode(self, axis=0, numeric_only=False, dropna=True):
70427042
"""
70437043
Gets the mode(s) of each element along the axis selected. Adds a row
70447044
for each mode per label, fills in gaps with nan.
@@ -7056,6 +7056,8 @@ def mode(self, axis=0, numeric_only=False):
70567056
* 1 or 'columns' : get mode of each row
70577057
numeric_only : boolean, default False
70587058
if True, only apply to numeric columns
7059+
dropna : boolean, default True
7060+
Don't consider counts of NaN/NaT.
70597061
70607062
Returns
70617063
-------
@@ -7072,7 +7074,7 @@ def mode(self, axis=0, numeric_only=False):
70727074
data = self if not numeric_only else self._get_numeric_data()
70737075

70747076
def f(s):
7075-
return s.mode()
7077+
return s.mode(dropna=dropna)
70767078

70777079
return data.apply(f, axis=axis)
70787080

pandas/core/series.py

+7-2
Original file line number | Diff line number | Diff line change
@@ -1431,17 +1431,22 @@ def count(self, level=None):
14311431
return self._constructor(out, index=lev,
14321432
dtype='int64').__finalize__(self)
14331433

1434-
def mode(self):
1434+
def mode(self, dropna=True):
14351435
"""Return the mode(s) of the dataset.
14361436
14371437
Always returns Series even if only one value is returned.
14381438
1439+
Parameters
1440+
----------
1441+
dropna : boolean, default True
1442+
Don't consider counts of NaN/NaT.
1443+
14391444
Returns
14401445
-------
14411446
modes : Series (sorted)
14421447
"""
14431448
# TODO: Add option for bins like value_counts()
1444-
return algorithms.mode(self)
1449+
return algorithms.mode(self, dropna=dropna)
14451450

14461451
def unique(self):
14471452
"""

pandas/tests/frame/test_analytics.py

+36-1
Original file line number | Diff line number | Diff line change
@@ -15,7 +15,8 @@
1515
from pandas.compat import lrange, product, PY35
1616
from pandas import (compat, isna, notna, DataFrame, Series,
1717
MultiIndex, date_range, Timestamp, Categorical,
18-
_np_version_under1p12, _np_version_under1p15)
18+
_np_version_under1p12, _np_version_under1p15,
19+
to_datetime, to_timedelta)
1920
import pandas as pd
2021
import pandas.core.nanops as nanops
2122
import pandas.core.algorithms as algorithms
@@ -889,6 +890,40 @@ def test_mode(self):
889890
dtype=df["C"].dtype)})
890891
tm.assert_frame_equal(df.mode(), exp)
891892

893+
def test_mode_dropna(self):
894+
# GH 17534
895+
# Test the dropna=False parameter for mode
896+
897+
df = pd.DataFrame({"A": [1, np.nan, np.nan, np.nan],
898+
"B": [np.nan, np.nan, 'a', np.nan],
899+
"C": Categorical([np.nan, np.nan, 'a', np.nan]),
900+
"D": to_datetime(['NaT', '2000-1-2', 'NaT', 'NaT']),
901+
"E": to_timedelta(['1 days', 'nan', 'nan', 'nan']),
902+
"F": [1, 1, np.nan, np.nan],
903+
"G": [np.nan, np.nan, 'a', 'a'],
904+
"H": Categorical(['a', np.nan, 'a', np.nan]),
905+
"I": to_datetime(['2000-1-2', '2000-1-2',
906+
'NaT', 'NaT']),
907+
"J": to_timedelta(['1 days', 'nan',
908+
'1 days', 'nan'])})
909+
910+
result = df.loc[:, 'A':'E'].mode(dropna=False)
911+
expected = pd.DataFrame({'A': [np.nan],
912+
'B': np.array([np.nan], dtype=object),
913+
'C': Categorical([np.nan], categories=['a']),
914+
'D': [pd.NaT],
915+
'E': to_timedelta([pd.NaT])})
916+
tm.assert_frame_equal(result, expected)
917+
918+
result = df.loc[:, 'F':'J'].mode(dropna=False)
919+
expected = pd.DataFrame({'F': [1, np.nan],
920+
'G': [np.nan, 'a'],
921+
'H': Categorical([np.nan, 'a'],
922+
categories=['a']),
923+
'I': to_datetime(['NaT', '2000-1-2']),
924+
'J': to_timedelta(['nan', '1 days'])})
925+
tm.assert_frame_equal(result, expected)
926+
892927
def test_operators_timedelta64(self):
893928
from datetime import timedelta
894929
df = DataFrame(dict(A=date_range('2012-1-1', periods=3, freq='D'),

pandas/tests/series/test_analytics.py

+25-1
Original file line number | Diff line number | Diff line change
@@ -12,7 +12,7 @@
1212

1313
from pandas import (Series, Categorical, DataFrame, isna, notna,
1414
bdate_range, date_range, _np_version_under1p10,
15-
CategoricalIndex)
15+
CategoricalIndex, to_datetime, to_timedelta)
1616
from pandas.core.index import MultiIndex
1717
from pandas.core.indexes.datetimes import Timestamp
1818
from pandas.core.indexes.timedeltas import Timedelta
@@ -321,6 +321,30 @@ def test_mode(self):
321321
exp = Series(exp, dtype='category')
322322
tm.assert_series_equal(Series(c).mode(), exp)
323323

324+
@pytest.mark.parametrize('values, expected', [
325+
([np.nan, np.nan, 1], [np.nan]),
326+
([np.nan, 1], [1, np.nan]),
327+
([np.nan, np.nan, 'a'], np.array([np.nan], dtype=object)),
328+
([np.nan, 'a'], [np.nan, 'a']),
329+
(Categorical([np.nan, np.nan, 'a']),
330+
Categorical([np.nan], categories=['a'])),
331+
(Categorical([np.nan, 'a']),
332+
Categorical([np.nan, 'a'], categories=['a'])),
333+
(Categorical([np.nan, np.nan, 1]),
334+
Categorical([np.nan], categories=[1])),
335+
(to_datetime(['NaT', '2000-1-2', 'NaT']), [pd.NaT]),
336+
(to_datetime(['NaT', '2000-1-2']), to_datetime(['NaT', '2000-1-2'])),
337+
(to_timedelta(['1 days', 'nan', 'nan']), to_timedelta(['NaT'])),
338+
(to_timedelta(['1 days', 'nan']), to_timedelta(['nan', '1 days']))
339+
])
340+
def test_mode_dropna(self, values, expected):
341+
# GH 17534
342+
# Test the dropna=False parameter for mode
343+
344+
result = Series(values).mode(dropna=False)
345+
expected = Series(expected)
346+
tm.assert_series_equal(result, expected)
347+
324348
def test_prod(self):
325349
self._check_stat_op('prod', np.prod)
326350

0 commit comments

Comments
 (0)