Skip to content

Commit 984def2

Browse files
authored
ENH: making mode stable/keeping original ordering (#39353)
1 parent bc3adf2 commit 984def2

File tree

4 files changed

+86
-127
lines changed

4 files changed

+86
-127
lines changed

asv_bench/benchmarks/series_methods.py

+18-5
Original file line numberDiff line numberDiff line change
@@ -284,16 +284,29 @@ def time_clip(self, n):
284284

285285
class ValueCounts:
286286

287-
params = ["int", "uint", "float", "object"]
288-
param_names = ["dtype"]
287+
params = [[10 ** 3, 10 ** 4, 10 ** 5], ["int", "uint", "float", "object"]]
288+
param_names = ["N", "dtype"]
289289

290-
def setup(self, dtype):
291-
self.s = Series(np.random.randint(0, 1000, size=100000)).astype(dtype)
290+
def setup(self, N, dtype):
291+
self.s = Series(np.random.randint(0, N, size=10 * N)).astype(dtype)
292292

293-
def time_value_counts(self, dtype):
293+
def time_value_counts(self, N, dtype):
294294
self.s.value_counts()
295295

296296

297+
class Mode:
298+
299+
params = [[10 ** 3, 10 ** 4, 10 ** 5], ["int", "uint", "float", "object"]]
300+
param_names = ["N", "dtype"]
301+
302+
def setup(self, N, dtype):
303+
np.random.seed(42)
304+
self.s = Series(np.random.randint(0, N, size=10 * N)).astype(dtype)
305+
306+
def time_mode(self, N, dtype):
307+
self.s.mode()
308+
309+
297310
class Dir:
298311
def setup(self):
299312
self.s = Series(index=tm.makeStringIndex(10000))

doc/source/whatsnew/v1.3.0.rst

+1-1
Original file line numberDiff line numberDiff line change
@@ -364,7 +364,7 @@ Reshaping
364364
- Bug in :func:`join` over :class:`MultiIndex` returning a wrong result when one of the two indexes had only one level (:issue:`36909`)
365365
- :meth:`merge_asof` raises ``ValueError`` instead of cryptic ``TypeError`` in case of non-numerical merge columns (:issue:`29130`)
366366
- Bug in :meth:`DataFrame.join` not assigning values correctly when having a :class:`MultiIndex` where at least one dimension is of dtype ``Categorical`` with non-alphabetically sorted categories (:issue:`38502`)
367-
- :meth:`Series.value_counts` returns keys in original order (:issue:`12679`, :issue:`11227`)
367+
- :meth:`Series.value_counts` and :meth:`Series.mode` now return keys in a consistent order, the order in which they first appear in the data (:issue:`12679`, :issue:`11227` and :issue:`39007`)
368368
- Bug in :meth:`DataFrame.apply` would give incorrect results when used with a string argument and ``axis=1`` when the axis argument was not supported and now raises a ``ValueError`` instead (:issue:`39211`)
369369
-
370370

pandas/_libs/hashtable_func_helper.pxi.in

+49-121
Original file line numberDiff line numberDiff line change
@@ -28,52 +28,6 @@ dtypes = [('Complex128', 'complex128', 'complex128',
2828
{{for name, dtype, ttype, c_type, to_c_type in dtypes}}
2929

3030

31-
@cython.wraparound(False)
32-
@cython.boundscheck(False)
33-
{{if dtype == 'object'}}
34-
cdef build_count_table_{{dtype}}(ndarray[{{dtype}}] values,
35-
kh_{{ttype}}_t *table, bint dropna):
36-
{{else}}
37-
cdef build_count_table_{{dtype}}(const {{dtype}}_t[:] values,
38-
kh_{{ttype}}_t *table, bint dropna):
39-
{{endif}}
40-
cdef:
41-
khiter_t k
42-
Py_ssize_t i, n = len(values)
43-
44-
{{c_type}} val
45-
46-
int ret = 0
47-
48-
{{if dtype == 'object'}}
49-
kh_resize_{{ttype}}(table, n // 10)
50-
51-
for i in range(n):
52-
val = values[i]
53-
if not checknull(val) or not dropna:
54-
k = kh_get_{{ttype}}(table, <PyObject*>val)
55-
if k != table.n_buckets:
56-
table.vals[k] += 1
57-
else:
58-
k = kh_put_{{ttype}}(table, <PyObject*>val, &ret)
59-
table.vals[k] = 1
60-
{{else}}
61-
with nogil:
62-
kh_resize_{{ttype}}(table, n)
63-
64-
for i in range(n):
65-
val = {{to_c_type}}(values[i])
66-
67-
if not is_nan_{{c_type}}(val) or not dropna:
68-
k = kh_get_{{ttype}}(table, val)
69-
if k != table.n_buckets:
70-
table.vals[k] += 1
71-
else:
72-
k = kh_put_{{ttype}}(table, val, &ret)
73-
table.vals[k] = 1
74-
{{endif}}
75-
76-
7731
@cython.wraparound(False)
7832
@cython.boundscheck(False)
7933
{{if dtype == 'object'}}
@@ -84,8 +38,6 @@ cpdef value_count_{{dtype}}(const {{dtype}}_t[:] values, bint dropna):
8438
cdef:
8539
Py_ssize_t i = 0
8640
Py_ssize_t n = len(values)
87-
size_t unique_key_index = 0
88-
size_t unique_key_count = 0
8941
kh_{{ttype}}_t *table
9042

9143
# Don't use Py_ssize_t, since table.n_buckets is unsigned
@@ -98,12 +50,10 @@ cpdef value_count_{{dtype}}(const {{dtype}}_t[:] values, bint dropna):
9850

9951
# we track the order in which keys are first seen (GH39009),
10052
# khash-map isn't insertion-ordered, thus:
101-
# table maps key to index_of_appearence
102-
# result_keys maps index_of_appearence to key
103-
# result_counts maps index_of_appearence to number of elements
53+
# table maps keys to counts
54+
# result_keys remembers the original order of keys
10455

10556
result_keys = {{name}}Vector()
106-
result_counts = Int64Vector()
10757
table = kh_init_{{ttype}}()
10858

10959
{{if dtype == 'object'}}
@@ -118,14 +68,11 @@ cpdef value_count_{{dtype}}(const {{dtype}}_t[:] values, bint dropna):
11868
val = navalue
11969
k = kh_get_{{ttype}}(table, <PyObject*>val)
12070
if k != table.n_buckets:
121-
unique_key_index = table.vals[k]
122-
result_counts.data.data[unique_key_index] += 1
71+
table.vals[k] += 1
12372
else:
12473
k = kh_put_{{ttype}}(table, <PyObject*>val, &ret)
125-
table.vals[k] = unique_key_count
74+
table.vals[k] = 1
12675
result_keys.append(val)
127-
result_counts.append(1)
128-
unique_key_count+=1
12976
{{else}}
13077
kh_resize_{{ttype}}(table, n)
13178

@@ -135,19 +82,26 @@ cpdef value_count_{{dtype}}(const {{dtype}}_t[:] values, bint dropna):
13582
if not is_nan_{{c_type}}(val) or not dropna:
13683
k = kh_get_{{ttype}}(table, val)
13784
if k != table.n_buckets:
138-
unique_key_index = table.vals[k]
139-
result_counts.data.data[unique_key_index] += 1
85+
table.vals[k] += 1
14086
else:
14187
k = kh_put_{{ttype}}(table, val, &ret)
142-
table.vals[k] = unique_key_count
88+
table.vals[k] = 1
14389
result_keys.append(val)
144-
result_counts.append(1)
145-
unique_key_count+=1
14690
{{endif}}
14791

92+
# collect counts in the order corresponding to result_keys:
93+
cdef int64_t[:] result_counts = np.empty(table.size, dtype=np.int64)
94+
for i in range(table.size):
95+
{{if dtype == 'object'}}
96+
k = kh_get_{{ttype}}(table, result_keys.data[i])
97+
{{else}}
98+
k = kh_get_{{ttype}}(table, result_keys.data.data[i])
99+
{{endif}}
100+
result_counts[i] = table.vals[k]
101+
148102
kh_destroy_{{ttype}}(table)
149103

150-
return result_keys.to_array(), result_counts.to_array()
104+
return result_keys.to_array(), result_counts.base
151105

152106

153107
@cython.wraparound(False)
@@ -294,78 +248,42 @@ def ismember_{{dtype}}(const {{dtype}}_t[:] arr, const {{dtype}}_t[:] values):
294248
kh_destroy_{{ttype}}(table)
295249
return result.view(np.bool_)
296250

297-
{{endfor}}
298-
299-
300251
# ----------------------------------------------------------------------
301252
# Mode Computations
302253
# ----------------------------------------------------------------------
303254

304-
{{py:
305-
306-
# dtype, ctype, table_type, npy_dtype
307-
dtypes = [('complex128', 'khcomplex128_t', 'complex128', 'complex128'),
308-
('complex64', 'khcomplex64_t', 'complex64', 'complex64'),
309-
('float64', 'float64_t', 'float64', 'float64'),
310-
('float32', 'float32_t', 'float32', 'float32'),
311-
('int64', 'int64_t', 'int64', 'int64'),
312-
('int32', 'int32_t', 'int32', 'int32'),
313-
('int16', 'int16_t', 'int16', 'int16'),
314-
('int8', 'int8_t', 'int8', 'int8'),
315-
('uint64', 'uint64_t', 'uint64', 'uint64'),
316-
('uint32', 'uint32_t', 'uint32', 'uint32'),
317-
('uint16', 'uint16_t', 'uint16', 'uint16'),
318-
('uint8', 'uint8_t', 'uint8', 'uint8'),
319-
('object', 'object', 'pymap', 'object_')]
320-
}}
321-
322-
{{for dtype, ctype, table_type, npy_dtype in dtypes}}
323-
324255

325256
@cython.wraparound(False)
326257
@cython.boundscheck(False)
327-
328258
{{if dtype == 'object'}}
329-
330-
331-
def mode_{{dtype}}(ndarray[{{ctype}}] values, bint dropna):
259+
def mode_{{dtype}}(ndarray[{{dtype}}] values, bint dropna):
332260
{{else}}
333-
334-
335261
def mode_{{dtype}}(const {{dtype}}_t[:] values, bint dropna):
336262
{{endif}}
337263
cdef:
338-
int count, max_count = 1
339-
int j = -1 # so you can do +=
340-
# Don't use Py_ssize_t, since table.n_buckets is unsigned
341-
khiter_t k
342-
kh_{{table_type}}_t *table
343-
ndarray[{{ctype}}] modes
264+
{{if dtype == 'object'}}
265+
ndarray[{{dtype}}] keys
266+
ndarray[{{dtype}}] modes
267+
{{else}}
268+
{{dtype}}_t[:] keys
269+
ndarray[{{dtype}}_t] modes
270+
{{endif}}
271+
int64_t[:] counts
272+
int64_t count, max_count = -1
273+
Py_ssize_t k, j = 0
344274

345-
table = kh_init_{{table_type}}()
346-
build_count_table_{{dtype}}(values, table, dropna)
275+
keys, counts = value_count_{{dtype}}(values, dropna)
347276

348-
modes = np.empty(table.n_buckets, dtype=np.{{npy_dtype}})
277+
{{if dtype == 'object'}}
278+
modes = np.empty(len(keys), dtype=np.object_)
279+
{{else}}
280+
modes = np.empty(len(keys), dtype=np.{{dtype}})
281+
{{endif}}
349282

350283
{{if dtype != 'object'}}
351284
with nogil:
352-
for k in range(table.n_buckets):
353-
if kh_exist_{{table_type}}(table, k):
354-
count = table.vals[k]
355-
if count == max_count:
356-
j += 1
357-
elif count > max_count:
358-
max_count = count
359-
j = 0
360-
else:
361-
continue
362-
363-
modes[j] = table.keys[k]
364-
{{else}}
365-
for k in range(table.n_buckets):
366-
if kh_exist_{{table_type}}(table, k):
367-
count = table.vals[k]
368-
285+
for k in range(len(keys)):
286+
count = counts[k]
369287
if count == max_count:
370288
j += 1
371289
elif count > max_count:
@@ -374,11 +292,21 @@ def mode_{{dtype}}(const {{dtype}}_t[:] values, bint dropna):
374292
else:
375293
continue
376294

377-
modes[j] = <object>table.keys[k]
295+
modes[j] = keys[k]
296+
{{else}}
297+
for k in range(len(keys)):
298+
count = counts[k]
299+
if count == max_count:
300+
j += 1
301+
elif count > max_count:
302+
max_count = count
303+
j = 0
304+
else:
305+
continue
306+
307+
modes[j] = keys[k]
378308
{{endif}}
379309

380-
kh_destroy_{{table_type}}(table)
381-
382310
return modes[:j + 1]
383311

384312
{{endfor}}

pandas/tests/libs/test_hashtable.py

+18
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66

77
from pandas._libs import hashtable as ht
88

9+
import pandas as pd
910
import pandas._testing as tm
1011

1112

@@ -323,6 +324,23 @@ def test_mode(self, dtype, type_suffix, writable):
323324
result = mode(values, False)
324325
assert result == 42
325326

327+
def test_mode_stable(self, dtype, type_suffix, writable):
328+
mode = get_ht_function("mode", type_suffix)
329+
values = np.array([2, 1, 5, 22, 3, -1, 8]).astype(dtype)
330+
values.flags.writeable = writable
331+
keys = mode(values, False)
332+
tm.assert_numpy_array_equal(keys, values)
333+
334+
335+
def test_modes_with_nans():
336+
# GH39007
337+
values = np.array([True, pd.NA, np.nan], dtype=np.object_)
338+
# pd.Na and np.nan will have the same representative: np.nan
339+
# thus we have 2 nans and 1 True
340+
modes = ht.mode_object(values, False)
341+
assert modes.size == 1
342+
assert np.isnan(modes[0])
343+
326344

327345
@pytest.mark.parametrize(
328346
"dtype, type_suffix",

0 commit comments

Comments
 (0)