Skip to content

Commit f1631be

Browse files
reidy-p authored and jreback committed
ENH: Implement mode(dropna=False) (#20779)
1 parent b237b11 commit f1631be

File tree

8 files changed

+293
-191
lines changed

8 files changed

+293
-191
lines changed

doc/source/whatsnew/v0.24.0.txt

+1
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ New features
1313
Other Enhancements
1414
^^^^^^^^^^^^^^^^^^
1515
- :func:`to_datetime` now supports the ``%Z`` and ``%z`` directive when passed into ``format`` (:issue:`13486`)
16+
- :func:`Series.mode` and :func:`DataFrame.mode` now support the ``dropna`` parameter, which can be used to specify whether ``NaN``/``NaT`` values should be counted when determining the mode (:issue:`17534`)
1617
- :func:`to_csv` now supports ``compression`` keyword when a file handle is passed. (:issue:`21227`)
1718
-
1819

pandas/_libs/hashtable_func_helper.pxi.in

+6-6
Original file line numberDiff line numberDiff line change
@@ -288,7 +288,8 @@ def ismember_{{dtype}}({{scalar}}[:] arr, {{scalar}}[:] values, bint hasnans=0):
288288
{{py:
289289

290290
# dtype, ctype, table_type, npy_dtype
291-
dtypes = [('int64', 'int64_t', 'int64', 'int64'),
291+
dtypes = [('float64', 'float64_t', 'float64', 'float64'),
292+
('int64', 'int64_t', 'int64', 'int64'),
292293
('uint64', 'uint64_t', 'uint64', 'uint64'),
293294
('object', 'object', 'pymap', 'object_')]
294295
}}
@@ -302,11 +303,11 @@ dtypes = [('int64', 'int64_t', 'int64', 'int64'),
302303
{{if dtype == 'object'}}
303304

304305

305-
def mode_{{dtype}}(ndarray[{{ctype}}] values):
306+
def mode_{{dtype}}(ndarray[{{ctype}}] values, bint dropna):
306307
{{else}}
307308

308309

309-
def mode_{{dtype}}({{ctype}}[:] values):
310+
def mode_{{dtype}}({{ctype}}[:] values, bint dropna):
310311
{{endif}}
311312
cdef:
312313
int count, max_count = 1
@@ -317,9 +318,9 @@ def mode_{{dtype}}({{ctype}}[:] values):
317318

318319
table = kh_init_{{table_type}}()
319320
{{if dtype == 'object'}}
320-
build_count_table_{{dtype}}(values, table, 1)
321+
build_count_table_{{dtype}}(values, table, dropna)
321322
{{else}}
322-
build_count_table_{{dtype}}(values, table, 0)
323+
build_count_table_{{dtype}}(values, table, dropna)
323324
{{endif}}
324325

325326
modes = np.empty(table.n_buckets, dtype=np.{{npy_dtype}})
@@ -329,7 +330,6 @@ def mode_{{dtype}}({{ctype}}[:] values):
329330
for k in range(table.n_buckets):
330331
if kh_exist_{{table_type}}(table, k):
331332
count = table.vals[k]
332-
333333
if count == max_count:
334334
j += 1
335335
elif count > max_count:

pandas/core/algorithms.py

+14-12
Original file line numberDiff line numberDiff line change
@@ -25,8 +25,8 @@
2525
is_bool_dtype, needs_i8_conversion,
2626
is_datetimetz,
2727
is_datetime64_any_dtype, is_datetime64tz_dtype,
28-
is_timedelta64_dtype, is_interval_dtype,
29-
is_scalar, is_list_like,
28+
is_timedelta64_dtype, is_datetimelike,
29+
is_interval_dtype, is_scalar, is_list_like,
3030
_ensure_platform_int, _ensure_object,
3131
_ensure_float64, _ensure_uint64,
3232
_ensure_int64)
@@ -798,14 +798,18 @@ def duplicated(values, keep='first'):
798798
return f(values, keep=keep)
799799

800800

801-
def mode(values):
801+
def mode(values, dropna=True):
802802
"""
803803
Returns the mode(s) of an array.
804804
805805
Parameters
806806
----------
807807
values : array-like
808808
Array over which to compute the mode(s).
809+
dropna : boolean, default True
810+
Don't consider counts of NaN/NaT.
811+
812+
.. versionadded:: 0.24.0
809813
810814
Returns
811815
-------
@@ -818,20 +822,18 @@ def mode(values):
818822

819823
# categorical is a fast-path
820824
if is_categorical_dtype(values):
821-
822825
if isinstance(values, Series):
823-
return Series(values.values.mode(), name=values.name)
824-
return values.mode()
826+
return Series(values.values.mode(dropna=dropna), name=values.name)
827+
return values.mode(dropna=dropna)
825828

826-
values, dtype, ndtype = _ensure_data(values)
829+
if dropna and is_datetimelike(values):
830+
mask = values.isnull()
831+
values = values[~mask]
827832

828-
# TODO: this should support float64
829-
if ndtype not in ['int64', 'uint64', 'object']:
830-
ndtype = 'object'
831-
values = _ensure_object(values)
833+
values, dtype, ndtype = _ensure_data(values)
832834

833835
f = getattr(htable, "mode_{dtype}".format(dtype=ndtype))
834-
result = f(values)
836+
result = f(values, dropna=dropna)
835837
try:
836838
result = np.sort(result)
837839
except TypeError as e:

pandas/core/arrays/categorical.py

+13-3
Original file line numberDiff line numberDiff line change
@@ -2118,20 +2118,30 @@ def max(self, numeric_only=None, **kwargs):
21182118
else:
21192119
return self.categories[pointer]
21202120

2121-
def mode(self):
2121+
def mode(self, dropna=True):
21222122
"""
21232123
Returns the mode(s) of the Categorical.
21242124
21252125
Always returns `Categorical` even if only one value.
21262126
2127+
Parameters
2128+
----------
2129+
dropna : boolean, default True
2130+
Don't consider counts of NaN/NaT.
2131+
2132+
.. versionadded:: 0.24.0
2133+
21272134
Returns
21282135
-------
21292136
modes : `Categorical` (sorted)
21302137
"""
21312138

21322139
import pandas._libs.hashtable as htable
2133-
good = self._codes != -1
2134-
values = sorted(htable.mode_int64(_ensure_int64(self._codes[good])))
2140+
values = self._codes
2141+
if dropna:
2142+
good = self._codes != -1
2143+
values = self._codes[good]
2144+
values = sorted(htable.mode_int64(_ensure_int64(values), dropna))
21352145
result = self._constructor(values=values, categories=self.categories,
21362146
ordered=self.ordered, fastpath=True)
21372147
return result

pandas/core/frame.py

+6-2
Original file line numberDiff line numberDiff line change
@@ -7038,7 +7038,7 @@ def _get_agg_axis(self, axis_num):
70387038
else:
70397039
raise ValueError('Axis must be 0 or 1 (got %r)' % axis_num)
70407040

7041-
def mode(self, axis=0, numeric_only=False):
7041+
def mode(self, axis=0, numeric_only=False, dropna=True):
70427042
"""
70437043
Gets the mode(s) of each element along the axis selected. Adds a row
70447044
for each mode per label, fills in gaps with nan.
@@ -7056,6 +7056,10 @@ def mode(self, axis=0, numeric_only=False):
70567056
* 1 or 'columns' : get mode of each row
70577057
numeric_only : boolean, default False
70587058
if True, only apply to numeric columns
7059+
dropna : boolean, default True
7060+
Don't consider counts of NaN/NaT.
7061+
7062+
.. versionadded:: 0.24.0
70597063
70607064
Returns
70617065
-------
@@ -7072,7 +7076,7 @@ def mode(self, axis=0, numeric_only=False):
70727076
data = self if not numeric_only else self._get_numeric_data()
70737077

70747078
def f(s):
7075-
return s.mode()
7079+
return s.mode(dropna=dropna)
70767080

70777081
return data.apply(f, axis=axis)
70787082

pandas/core/series.py

+9-2
Original file line numberDiff line numberDiff line change
@@ -1431,17 +1431,24 @@ def count(self, level=None):
14311431
return self._constructor(out, index=lev,
14321432
dtype='int64').__finalize__(self)
14331433

1434-
def mode(self):
1434+
def mode(self, dropna=True):
14351435
"""Return the mode(s) of the dataset.
14361436
14371437
Always returns Series even if only one value is returned.
14381438
1439+
Parameters
1440+
----------
1441+
dropna : boolean, default True
1442+
Don't consider counts of NaN/NaT.
1443+
1444+
.. versionadded:: 0.24.0
1445+
14391446
Returns
14401447
-------
14411448
modes : Series (sorted)
14421449
"""
14431450
# TODO: Add option for bins like value_counts()
1444-
return algorithms.mode(self)
1451+
return algorithms.mode(self, dropna=dropna)
14451452

14461453
def unique(self):
14471454
"""

pandas/tests/frame/test_analytics.py

+70-50
Original file line numberDiff line numberDiff line change
@@ -15,11 +15,11 @@
1515
from pandas.compat import lrange, product, PY35
1616
from pandas import (compat, isna, notna, DataFrame, Series,
1717
MultiIndex, date_range, Timestamp, Categorical,
18-
_np_version_under1p12, _np_version_under1p15)
18+
_np_version_under1p12, _np_version_under1p15,
19+
to_datetime, to_timedelta)
1920
import pandas as pd
2021
import pandas.core.nanops as nanops
2122
import pandas.core.algorithms as algorithms
22-
import pandas.io.formats.printing as printing
2323

2424
import pandas.util.testing as tm
2525
import pandas.util._test_decorators as td
@@ -840,54 +840,74 @@ def wrapper(x):
840840
expected = pd.Series(unit, index=r1.index, dtype=r1.dtype)
841841
tm.assert_series_equal(r1, expected)
842842

843-
def test_mode(self):
844-
df = pd.DataFrame({"A": [12, 12, 11, 12, 19, 11],
845-
"B": [10, 10, 10, np.nan, 3, 4],
846-
"C": [8, 8, 8, 9, 9, 9],
847-
"D": np.arange(6, dtype='int64'),
848-
"E": [8, 8, 1, 1, 3, 3]})
849-
tm.assert_frame_equal(df[["A"]].mode(),
850-
pd.DataFrame({"A": [12]}))
851-
expected = pd.Series([0, 1, 2, 3, 4, 5], dtype='int64', name='D').\
852-
to_frame()
853-
tm.assert_frame_equal(df[["D"]].mode(), expected)
854-
expected = pd.Series([1, 3, 8], dtype='int64', name='E').to_frame()
855-
tm.assert_frame_equal(df[["E"]].mode(), expected)
856-
tm.assert_frame_equal(df[["A", "B"]].mode(),
857-
pd.DataFrame({"A": [12], "B": [10.]}))
858-
tm.assert_frame_equal(df.mode(),
859-
pd.DataFrame({"A": [12, np.nan, np.nan, np.nan,
860-
np.nan, np.nan],
861-
"B": [10, np.nan, np.nan, np.nan,
862-
np.nan, np.nan],
863-
"C": [8, 9, np.nan, np.nan, np.nan,
864-
np.nan],
865-
"D": [0, 1, 2, 3, 4, 5],
866-
"E": [1, 3, 8, np.nan, np.nan,
867-
np.nan]}))
868-
869-
# outputs in sorted order
870-
df["C"] = list(reversed(df["C"]))
871-
printing.pprint_thing(df["C"])
872-
printing.pprint_thing(df["C"].mode())
873-
a, b = (df[["A", "B", "C"]].mode(),
874-
pd.DataFrame({"A": [12, np.nan],
875-
"B": [10, np.nan],
876-
"C": [8, 9]}))
877-
printing.pprint_thing(a)
878-
printing.pprint_thing(b)
879-
tm.assert_frame_equal(a, b)
880-
# should work with heterogeneous types
881-
df = pd.DataFrame({"A": np.arange(6, dtype='int64'),
882-
"B": pd.date_range('2011', periods=6),
883-
"C": list('abcdef')})
884-
exp = pd.DataFrame({"A": pd.Series(np.arange(6, dtype='int64'),
885-
dtype=df["A"].dtype),
886-
"B": pd.Series(pd.date_range('2011', periods=6),
887-
dtype=df["B"].dtype),
888-
"C": pd.Series(list('abcdef'),
889-
dtype=df["C"].dtype)})
890-
tm.assert_frame_equal(df.mode(), exp)
843+
@pytest.mark.parametrize("dropna, expected", [
844+
(True, {'A': [12],
845+
'B': [10.0],
846+
'C': [1.0],
847+
'D': ['a'],
848+
'E': Categorical(['a'], categories=['a']),
849+
'F': to_datetime(['2000-1-2']),
850+
'G': to_timedelta(['1 days'])}),
851+
(False, {'A': [12],
852+
'B': [10.0],
853+
'C': [np.nan],
854+
'D': np.array([np.nan], dtype=object),
855+
'E': Categorical([np.nan], categories=['a']),
856+
'F': [pd.NaT],
857+
'G': to_timedelta([pd.NaT])}),
858+
(True, {'H': [8, 9, np.nan, np.nan],
859+
'I': [8, 9, np.nan, np.nan],
860+
'J': [1, np.nan, np.nan, np.nan],
861+
'K': Categorical(['a', np.nan, np.nan, np.nan],
862+
categories=['a']),
863+
'L': to_datetime(['2000-1-2', 'NaT', 'NaT', 'NaT']),
864+
'M': to_timedelta(['1 days', 'nan', 'nan', 'nan']),
865+
'N': [0, 1, 2, 3]}),
866+
(False, {'H': [8, 9, np.nan, np.nan],
867+
'I': [8, 9, np.nan, np.nan],
868+
'J': [1, np.nan, np.nan, np.nan],
869+
'K': Categorical([np.nan, 'a', np.nan, np.nan],
870+
categories=['a']),
871+
'L': to_datetime(['NaT', '2000-1-2', 'NaT', 'NaT']),
872+
'M': to_timedelta(['nan', '1 days', 'nan', 'nan']),
873+
'N': [0, 1, 2, 3]})
874+
])
875+
def test_mode_dropna(self, dropna, expected):
876+
877+
df = DataFrame({"A": [12, 12, 19, 11],
878+
"B": [10, 10, np.nan, 3],
879+
"C": [1, np.nan, np.nan, np.nan],
880+
"D": [np.nan, np.nan, 'a', np.nan],
881+
"E": Categorical([np.nan, np.nan, 'a', np.nan]),
882+
"F": to_datetime(['NaT', '2000-1-2', 'NaT', 'NaT']),
883+
"G": to_timedelta(['1 days', 'nan', 'nan', 'nan']),
884+
"H": [8, 8, 9, 9],
885+
"I": [9, 9, 8, 8],
886+
"J": [1, 1, np.nan, np.nan],
887+
"K": Categorical(['a', np.nan, 'a', np.nan]),
888+
"L": to_datetime(['2000-1-2', '2000-1-2',
889+
'NaT', 'NaT']),
890+
"M": to_timedelta(['1 days', 'nan',
891+
'1 days', 'nan']),
892+
"N": np.arange(4, dtype='int64')})
893+
894+
result = df[sorted(list(expected.keys()))].mode(dropna=dropna)
895+
expected = DataFrame(expected)
896+
tm.assert_frame_equal(result, expected)
897+
898+
@pytest.mark.skipif(not compat.PY3, reason="only PY3")
899+
def test_mode_sortwarning(self):
900+
# Check for the warning that is raised when the mode
901+
# results cannot be sorted
902+
903+
df = DataFrame({"A": [np.nan, np.nan, 'a', 'a']})
904+
expected = DataFrame({'A': ['a', np.nan]})
905+
906+
with tm.assert_produces_warning(UserWarning, check_stacklevel=False):
907+
result = df.mode(dropna=False)
908+
result = result.sort_values(by='A').reset_index(drop=True)
909+
910+
tm.assert_frame_equal(result, expected)
891911

892912
def test_operators_timedelta64(self):
893913
from datetime import timedelta

0 commit comments

Comments
 (0)