Skip to content

Commit 6816c63

Browse files
committed
ENH: mode(dropna=False) (pandas-dev#17534)
1 parent 3a2e9e6 commit 6816c63

File tree

6 files changed, with 48 additions and 20 deletions.

pandas/_libs/hashtable_func_helper.pxi.in

+10-7
Original file line numberDiff line numberDiff line change
@@ -39,20 +39,23 @@ cdef build_count_table_{{dtype}}({{dtype}}_t[:] values,
3939
int ret = 0
4040

4141
{{if dtype == 'object'}}
42+
print('object way')
4243
kh_resize_{{ttype}}(table, n // 10)
4344

4445
for i in range(n):
4546
val = values[i]
4647

4748
if not checknull(val) or not dropna:
4849
k = kh_get_{{ttype}}(table, <PyObject*> val)
50+
print(k, val, table.n_buckets)
4951
if k != table.n_buckets:
5052
table.vals[k] += 1
5153
else:
5254
k = kh_put_{{ttype}}(table, <PyObject*> val, &ret)
5355
table.vals[k] = 1
5456
{{else}}
5557
with nogil:
58+
#print('non-object way')
5659
kh_resize_{{ttype}}(table, n)
5760

5861
for i in range(n):
@@ -288,25 +291,25 @@ def ismember_{{dtype}}({{scalar}}[:] arr, {{scalar}}[:] values, bint hasnans=0):
288291
{{py:
289292

290293
# dtype, ctype, table_type, npy_dtype
291-
dtypes = [('int64', 'int64_t', 'int64', 'int64'),
294+
dtypes = [('float64', 'float64_t', 'float64', 'float64'),
295+
('int64', 'int64_t', 'int64', 'int64'),
292296
('uint64', 'uint64_t', 'uint64', 'uint64'),
293297
('object', 'object', 'pymap', 'object_')]
294298
}}
295299

296300
{{for dtype, ctype, table_type, npy_dtype in dtypes}}
297301

298-
299302
@cython.wraparound(False)
300303
@cython.boundscheck(False)
301304

302305
{{if dtype == 'object'}}
303306

304307

305-
def mode_{{dtype}}(ndarray[{{ctype}}] values):
308+
def mode_{{dtype}}(ndarray[{{ctype}}] values, bint dropna):
306309
{{else}}
307310

308311

309-
def mode_{{dtype}}({{ctype}}[:] values):
312+
def mode_{{dtype}}({{ctype}}[:] values, bint dropna):
310313
{{endif}}
311314
cdef:
312315
int count, max_count = 1
@@ -317,9 +320,9 @@ def mode_{{dtype}}({{ctype}}[:] values):
317320

318321
table = kh_init_{{table_type}}()
319322
{{if dtype == 'object'}}
320-
build_count_table_{{dtype}}(values, table, 1)
323+
build_count_table_{{dtype}}(values, table, dropna)
321324
{{else}}
322-
build_count_table_{{dtype}}(values, table, 0)
325+
build_count_table_{{dtype}}(values, table, dropna)
323326
{{endif}}
324327

325328
modes = np.empty(table.n_buckets, dtype=np.{{npy_dtype}})
@@ -329,7 +332,6 @@ def mode_{{dtype}}({{ctype}}[:] values):
329332
for k in range(table.n_buckets):
330333
if kh_exist_{{table_type}}(table, k):
331334
count = table.vals[k]
332-
333335
if count == max_count:
334336
j += 1
335337
elif count > max_count:
@@ -342,6 +344,7 @@ def mode_{{dtype}}({{ctype}}[:] values):
342344
{{else}}
343345
for k in range(table.n_buckets):
344346
if kh_exist_{{table_type}}(table, k):
347+
print(<object> table.keys[k], table.vals[k])
345348
count = table.vals[k]
346349

347350
if count == max_count:

pandas/core/algorithms.py

+10-6
Original file line numberDiff line numberDiff line change
@@ -791,7 +791,7 @@ def duplicated(values, keep='first'):
791791
return f(values, keep=keep)
792792

793793

794-
def mode(values):
794+
def mode(values, dropna=True):
795795
"""
796796
Returns the mode(s) of an array.
797797
@@ -803,6 +803,8 @@ def mode(values):
803803
Returns
804804
-------
805805
mode : Series
806+
dropna : boolean, default True
807+
Don't include counts of NaN.
806808
"""
807809
from pandas import Series
808810

@@ -818,13 +820,15 @@ def mode(values):
818820

819821
values, dtype, ndtype = _ensure_data(values)
820822

821-
# TODO: this should support float64
822-
if ndtype not in ['int64', 'uint64', 'object']:
823-
ndtype = 'object'
824-
values = _ensure_object(values)
823+
print('dtype', values.dtype)
824+
## TODO: this should support float64
825+
#if ndtype not in ['int64', 'uint64', 'object']:
826+
# ndtype = 'object'
827+
# values = _ensure_object(values)
825828

826829
f = getattr(htable, "mode_{dtype}".format(dtype=ndtype))
827-
result = f(values)
830+
print('dtype', values.dtype)
831+
result = f(values, dropna=dropna)
828832
try:
829833
result = np.sort(result)
830834
except TypeError as e:

pandas/core/arrays/categorical.py

+8-3
Original file line numberDiff line numberDiff line change
@@ -2044,7 +2044,7 @@ def max(self, numeric_only=None, **kwargs):
20442044
else:
20452045
return self.categories[pointer]
20462046

2047-
def mode(self):
2047+
def mode(self, dropna=True):
20482048
"""
20492049
Returns the mode(s) of the Categorical.
20502050
@@ -2053,11 +2053,16 @@ def mode(self):
20532053
Returns
20542054
-------
20552055
modes : `Categorical` (sorted)
2056+
dropna : boolean, default True
2057+
Don't include counts of NaN.
20562058
"""
20572059

20582060
import pandas._libs.hashtable as htable
2059-
good = self._codes != -1
2060-
values = sorted(htable.mode_int64(_ensure_int64(self._codes[good])))
2061+
#if dropna:
2062+
# good = self._codes != -1
2063+
# values = sorted(htable.mode_int64(_ensure_int64(self._codes[good]), dropna))
2064+
#else:
2065+
values = sorted(htable.mode_int64(_ensure_int64(self._codes), dropna))
20612066
result = self._constructor(values=values, categories=self.categories,
20622067
ordered=self.ordered, fastpath=True)
20632068
return result

pandas/core/frame.py

+4-2
Original file line numberDiff line numberDiff line change
@@ -6992,7 +6992,7 @@ def _get_agg_axis(self, axis_num):
69926992
else:
69936993
raise ValueError('Axis must be 0 or 1 (got %r)' % axis_num)
69946994

6995-
def mode(self, axis=0, numeric_only=False):
6995+
def mode(self, axis=0, numeric_only=False, dropna=True):
69966996
"""
69976997
Gets the mode(s) of each element along the axis selected. Adds a row
69986998
for each mode per label, fills in gaps with nan.
@@ -7010,6 +7010,8 @@ def mode(self, axis=0, numeric_only=False):
70107010
* 1 or 'columns' : get mode of each row
70117011
numeric_only : boolean, default False
70127012
if True, only apply to numeric columns
7013+
dropna : boolean, default True
7014+
Don't include counts of NaN.
70137015
70147016
Returns
70157017
-------
@@ -7026,7 +7028,7 @@ def mode(self, axis=0, numeric_only=False):
70267028
data = self if not numeric_only else self._get_numeric_data()
70277029

70287030
def f(s):
7029-
return s.mode()
7031+
return s.mode(dropna=dropna)
70307032

70317033
return data.apply(f, axis=axis)
70327034

pandas/core/series.py

+4-2
Original file line numberDiff line numberDiff line change
@@ -1420,17 +1420,19 @@ def count(self, level=None):
14201420
return self._constructor(out, index=lev,
14211421
dtype='int64').__finalize__(self)
14221422

1423-
def mode(self):
1423+
def mode(self, dropna=True):
14241424
"""Return the mode(s) of the dataset.
14251425
14261426
Always returns Series even if only one value is returned.
14271427
14281428
Returns
14291429
-------
14301430
modes : Series (sorted)
1431+
1432+
14311433
"""
14321434
# TODO: Add option for bins like value_counts()
1433-
return algorithms.mode(self)
1435+
return algorithms.mode(self, dropna=dropna)
14341436

14351437
def unique(self):
14361438
"""

pandas/tests/series/test_analytics.py

+12
Original file line numberDiff line numberDiff line change
@@ -321,6 +321,18 @@ def test_mode(self):
321321
exp = Series(exp, dtype='category')
322322
tm.assert_series_equal(Series(c).mode(), exp)
323323

324+
@pytest.mark.parametrize('data, expected', [
325+
([np.nan, np.nan, 1], [np.nan]),
326+
([np.nan, np.nan, 'a'], np.array([np.nan], dtype=object)),
327+
(Categorical([np.nan, np.nan, 'a']),
328+
Categorical([np.nan], categories=['a']))
329+
])
330+
def test_mode_dropna(self, data, expected):
331+
332+
result = Series(data).mode(dropna=False)
333+
expected = Series(expected)
334+
tm.assert_series_equal(result, expected)
335+
324336
def test_prod(self):
325337
self._check_stat_op('prod', np.prod)
326338

0 commit comments

Comments
 (0)