Skip to content

ENH: Implement mode(dropna=False) #20779

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 8 commits into from
May 31, 2018
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.24.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ New features
Other Enhancements
^^^^^^^^^^^^^^^^^^
- :func:`to_datetime` now supports the ``%Z`` and ``%z`` directive when passed into ``format`` (:issue:`13486`)
- :func:`Series.mode` and :func:`DataFrame.mode` now support the ``dropna`` parameter which can be used to specify whether NaN/NaT values should be considered (:issue:`17534`)
- :func:`to_csv` now supports ``compression`` keyword when a file handle is passed. (:issue:`21227`)
-

Expand Down
12 changes: 6 additions & 6 deletions pandas/_libs/hashtable_func_helper.pxi.in
Original file line number Diff line number Diff line change
Expand Up @@ -288,7 +288,8 @@ def ismember_{{dtype}}({{scalar}}[:] arr, {{scalar}}[:] values, bint hasnans=0):
{{py:

# dtype, ctype, table_type, npy_dtype
dtypes = [('int64', 'int64_t', 'int64', 'int64'),
dtypes = [('float64', 'float64_t', 'float64', 'float64'),
('int64', 'int64_t', 'int64', 'int64'),
('uint64', 'uint64_t', 'uint64', 'uint64'),
('object', 'object', 'pymap', 'object_')]
}}
Expand All @@ -302,11 +303,11 @@ dtypes = [('int64', 'int64_t', 'int64', 'int64'),
{{if dtype == 'object'}}


def mode_{{dtype}}(ndarray[{{ctype}}] values):
def mode_{{dtype}}(ndarray[{{ctype}}] values, bint dropna):
{{else}}


def mode_{{dtype}}({{ctype}}[:] values):
def mode_{{dtype}}({{ctype}}[:] values, bint dropna):
{{endif}}
cdef:
int count, max_count = 1
Expand All @@ -317,9 +318,9 @@ def mode_{{dtype}}({{ctype}}[:] values):

table = kh_init_{{table_type}}()
{{if dtype == 'object'}}
build_count_table_{{dtype}}(values, table, 1)
build_count_table_{{dtype}}(values, table, dropna)
{{else}}
build_count_table_{{dtype}}(values, table, 0)
build_count_table_{{dtype}}(values, table, dropna)
{{endif}}

modes = np.empty(table.n_buckets, dtype=np.{{npy_dtype}})
Expand All @@ -329,7 +330,6 @@ def mode_{{dtype}}({{ctype}}[:] values):
for k in range(table.n_buckets):
if kh_exist_{{table_type}}(table, k):
count = table.vals[k]

if count == max_count:
j += 1
elif count > max_count:
Expand Down
26 changes: 14 additions & 12 deletions pandas/core/algorithms.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,8 +25,8 @@
is_bool_dtype, needs_i8_conversion,
is_datetimetz,
is_datetime64_any_dtype, is_datetime64tz_dtype,
is_timedelta64_dtype, is_interval_dtype,
is_scalar, is_list_like,
is_timedelta64_dtype, is_datetimelike,
is_interval_dtype, is_scalar, is_list_like,
_ensure_platform_int, _ensure_object,
_ensure_float64, _ensure_uint64,
_ensure_int64)
Expand Down Expand Up @@ -798,14 +798,18 @@ def duplicated(values, keep='first'):
return f(values, keep=keep)


def mode(values):
def mode(values, dropna=True):
"""
Returns the mode(s) of an array.

Parameters
----------
values : array-like
Array over which to compute the mode(s).
dropna : boolean, default True
Don't consider counts of NaN/NaT.

.. versionadded:: 0.24.0

Returns
-------
Expand All @@ -818,20 +822,18 @@ def mode(values):

# categorical is a fast-path
if is_categorical_dtype(values):

if isinstance(values, Series):
return Series(values.values.mode(), name=values.name)
return values.mode()
return Series(values.values.mode(dropna=dropna), name=values.name)
return values.mode(dropna=dropna)

values, dtype, ndtype = _ensure_data(values)
if dropna and is_datetimelike(values):
mask = values.isnull()
values = values[~mask]

# TODO: this should support float64
if ndtype not in ['int64', 'uint64', 'object']:
ndtype = 'object'
values = _ensure_object(values)
values, dtype, ndtype = _ensure_data(values)

f = getattr(htable, "mode_{dtype}".format(dtype=ndtype))
result = f(values)
result = f(values, dropna=dropna)
try:
result = np.sort(result)
except TypeError as e:
Expand Down
16 changes: 13 additions & 3 deletions pandas/core/arrays/categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -2118,20 +2118,30 @@ def max(self, numeric_only=None, **kwargs):
else:
return self.categories[pointer]

def mode(self):
def mode(self, dropna=True):
"""
Returns the mode(s) of the Categorical.

Always returns `Categorical` even if only one value.

Parameters
----------
dropna : boolean, default True
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please add a ``versionadded`` directive (do this any time you add a parameter).

Don't consider counts of NaN/NaT.

.. versionadded:: 0.24.0

Returns
-------
modes : `Categorical` (sorted)
"""

import pandas._libs.hashtable as htable
good = self._codes != -1
values = sorted(htable.mode_int64(_ensure_int64(self._codes[good])))
values = self._codes
if dropna:
good = self._codes != -1
values = self._codes[good]
values = sorted(htable.mode_int64(_ensure_int64(values), dropna))
result = self._constructor(values=values, categories=self.categories,
ordered=self.ordered, fastpath=True)
return result
Expand Down
8 changes: 6 additions & 2 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -7038,7 +7038,7 @@ def _get_agg_axis(self, axis_num):
else:
raise ValueError('Axis must be 0 or 1 (got %r)' % axis_num)

def mode(self, axis=0, numeric_only=False):
def mode(self, axis=0, numeric_only=False, dropna=True):
"""
Gets the mode(s) of each element along the axis selected. Adds a row
for each mode per label, fills in gaps with nan.
Expand All @@ -7056,6 +7056,10 @@ def mode(self, axis=0, numeric_only=False):
* 1 or 'columns' : get mode of each row
numeric_only : boolean, default False
if True, only apply to numeric columns
dropna : boolean, default True
Don't consider counts of NaN/NaT.

.. versionadded:: 0.24.0

Returns
-------
Expand All @@ -7072,7 +7076,7 @@ def mode(self, axis=0, numeric_only=False):
data = self if not numeric_only else self._get_numeric_data()

def f(s):
return s.mode()
return s.mode(dropna=dropna)

return data.apply(f, axis=axis)

Expand Down
11 changes: 9 additions & 2 deletions pandas/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -1431,17 +1431,24 @@ def count(self, level=None):
return self._constructor(out, index=lev,
dtype='int64').__finalize__(self)

def mode(self):
def mode(self, dropna=True):
"""Return the mode(s) of the dataset.

Always returns Series even if only one value is returned.

Parameters
----------
dropna : boolean, default True
Don't consider counts of NaN/NaT.

.. versionadded:: 0.24.0

Returns
-------
modes : Series (sorted)
"""
# TODO: Add option for bins like value_counts()
return algorithms.mode(self)
return algorithms.mode(self, dropna=dropna)

def unique(self):
"""
Expand Down
120 changes: 70 additions & 50 deletions pandas/tests/frame/test_analytics.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,11 +15,11 @@
from pandas.compat import lrange, product, PY35
from pandas import (compat, isna, notna, DataFrame, Series,
MultiIndex, date_range, Timestamp, Categorical,
_np_version_under1p12, _np_version_under1p15)
_np_version_under1p12, _np_version_under1p15,
to_datetime, to_timedelta)
import pandas as pd
import pandas.core.nanops as nanops
import pandas.core.algorithms as algorithms
import pandas.io.formats.printing as printing

import pandas.util.testing as tm
import pandas.util._test_decorators as td
Expand Down Expand Up @@ -840,54 +840,74 @@ def wrapper(x):
expected = pd.Series(unit, index=r1.index, dtype=r1.dtype)
tm.assert_series_equal(r1, expected)

def test_mode(self):
df = pd.DataFrame({"A": [12, 12, 11, 12, 19, 11],
"B": [10, 10, 10, np.nan, 3, 4],
"C": [8, 8, 8, 9, 9, 9],
"D": np.arange(6, dtype='int64'),
"E": [8, 8, 1, 1, 3, 3]})
tm.assert_frame_equal(df[["A"]].mode(),
pd.DataFrame({"A": [12]}))
expected = pd.Series([0, 1, 2, 3, 4, 5], dtype='int64', name='D').\
to_frame()
tm.assert_frame_equal(df[["D"]].mode(), expected)
expected = pd.Series([1, 3, 8], dtype='int64', name='E').to_frame()
tm.assert_frame_equal(df[["E"]].mode(), expected)
tm.assert_frame_equal(df[["A", "B"]].mode(),
pd.DataFrame({"A": [12], "B": [10.]}))
tm.assert_frame_equal(df.mode(),
pd.DataFrame({"A": [12, np.nan, np.nan, np.nan,
np.nan, np.nan],
"B": [10, np.nan, np.nan, np.nan,
np.nan, np.nan],
"C": [8, 9, np.nan, np.nan, np.nan,
np.nan],
"D": [0, 1, 2, 3, 4, 5],
"E": [1, 3, 8, np.nan, np.nan,
np.nan]}))

# outputs in sorted order
df["C"] = list(reversed(df["C"]))
printing.pprint_thing(df["C"])
printing.pprint_thing(df["C"].mode())
a, b = (df[["A", "B", "C"]].mode(),
pd.DataFrame({"A": [12, np.nan],
"B": [10, np.nan],
"C": [8, 9]}))
printing.pprint_thing(a)
printing.pprint_thing(b)
tm.assert_frame_equal(a, b)
# should work with heterogeneous types
df = pd.DataFrame({"A": np.arange(6, dtype='int64'),
"B": pd.date_range('2011', periods=6),
"C": list('abcdef')})
exp = pd.DataFrame({"A": pd.Series(np.arange(6, dtype='int64'),
dtype=df["A"].dtype),
"B": pd.Series(pd.date_range('2011', periods=6),
dtype=df["B"].dtype),
"C": pd.Series(list('abcdef'),
dtype=df["C"].dtype)})
tm.assert_frame_equal(df.mode(), exp)
@pytest.mark.parametrize("dropna, expected", [
(True, {'A': [12],
'B': [10.0],
'C': [1.0],
'D': ['a'],
'E': Categorical(['a'], categories=['a']),
'F': to_datetime(['2000-1-2']),
'G': to_timedelta(['1 days'])}),
(False, {'A': [12],
'B': [10.0],
'C': [np.nan],
'D': np.array([np.nan], dtype=object),
'E': Categorical([np.nan], categories=['a']),
'F': [pd.NaT],
'G': to_timedelta([pd.NaT])}),
(True, {'H': [8, 9, np.nan, np.nan],
'I': [8, 9, np.nan, np.nan],
'J': [1, np.nan, np.nan, np.nan],
'K': Categorical(['a', np.nan, np.nan, np.nan],
categories=['a']),
'L': to_datetime(['2000-1-2', 'NaT', 'NaT', 'NaT']),
'M': to_timedelta(['1 days', 'nan', 'nan', 'nan']),
'N': [0, 1, 2, 3]}),
(False, {'H': [8, 9, np.nan, np.nan],
'I': [8, 9, np.nan, np.nan],
'J': [1, np.nan, np.nan, np.nan],
'K': Categorical([np.nan, 'a', np.nan, np.nan],
categories=['a']),
'L': to_datetime(['NaT', '2000-1-2', 'NaT', 'NaT']),
'M': to_timedelta(['nan', '1 days', 'nan', 'nan']),
'N': [0, 1, 2, 3]})
])
def test_mode_dropna(self, dropna, expected):

df = DataFrame({"A": [12, 12, 19, 11],
"B": [10, 10, np.nan, 3],
"C": [1, np.nan, np.nan, np.nan],
"D": [np.nan, np.nan, 'a', np.nan],
"E": Categorical([np.nan, np.nan, 'a', np.nan]),
"F": to_datetime(['NaT', '2000-1-2', 'NaT', 'NaT']),
"G": to_timedelta(['1 days', 'nan', 'nan', 'nan']),
"H": [8, 8, 9, 9],
"I": [9, 9, 8, 8],
"J": [1, 1, np.nan, np.nan],
"K": Categorical(['a', np.nan, 'a', np.nan]),
"L": to_datetime(['2000-1-2', '2000-1-2',
'NaT', 'NaT']),
"M": to_timedelta(['1 days', 'nan',
'1 days', 'nan']),
"N": np.arange(4, dtype='int64')})

result = df[sorted(list(expected.keys()))].mode(dropna=dropna)
expected = DataFrame(expected)
tm.assert_frame_equal(result, expected)

@pytest.mark.skipif(not compat.PY3, reason="only PY3")
def test_mode_sortwarning(self):
# Check for the warning that is raised when the mode
# results cannot be sorted

df = DataFrame({"A": [np.nan, np.nan, 'a', 'a']})
expected = DataFrame({'A': ['a', np.nan]})

with tm.assert_produces_warning(UserWarning, check_stacklevel=False):
result = df.mode(dropna=False)
result = result.sort_values(by='A').reset_index(drop=True)

tm.assert_frame_equal(result, expected)

def test_operators_timedelta64(self):
from datetime import timedelta
Expand Down
Loading