diff --git a/asv_bench/benchmarks/series_methods.py b/asv_bench/benchmarks/series_methods.py index e3ab8b243e25a..3214b21133b72 100644 --- a/asv_bench/benchmarks/series_methods.py +++ b/asv_bench/benchmarks/series_methods.py @@ -284,16 +284,29 @@ def time_clip(self, n): class ValueCounts: - params = ["int", "uint", "float", "object"] - param_names = ["dtype"] + params = [[10 ** 3, 10 ** 4, 10 ** 5], ["int", "uint", "float", "object"]] + param_names = ["N", "dtype"] - def setup(self, dtype): - self.s = Series(np.random.randint(0, 1000, size=100000)).astype(dtype) + def setup(self, N, dtype): + self.s = Series(np.random.randint(0, N, size=10 * N)).astype(dtype) - def time_value_counts(self, dtype): + def time_value_counts(self, N, dtype): self.s.value_counts() +class Mode: + + params = [[10 ** 3, 10 ** 4, 10 ** 5], ["int", "uint", "float", "object"]] + param_names = ["N", "dtype"] + + def setup(self, N, dtype): + np.random.seed(42) + self.s = Series(np.random.randint(0, N, size=10 * N)).astype(dtype) + + def time_mode(self, N, dtype): + self.s.mode() + + class Dir: def setup(self): self.s = Series(index=tm.makeStringIndex(10000)) diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index 381a05a18b278..284371efb05b2 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -359,7 +359,7 @@ Reshaping - Bug in :func:`join` over :class:`MultiIndex` returned wrong result, when one of both indexes had only one level (:issue:`36909`) - :meth:`merge_asof` raises ``ValueError`` instead of cryptic ``TypeError`` in case of non-numerical merge columns (:issue:`29130`) - Bug in :meth:`DataFrame.join` not assigning values correctly when having :class:`MultiIndex` where at least one dimension is from dtype ``Categorical`` with non-alphabetically sorted categories (:issue:`38502`) -- :meth:`Series.value_counts` returns keys in original order (:issue:`12679`, :issue:`11227`) +- :meth:`Series.value_counts` and :meth:`Series.mode` return consistent keys in original order (:issue:`12679`, :issue:`11227` and :issue:`39007`) - Bug in :meth:`DataFrame.apply` would give incorrect results when used with a string argument and ``axis=1`` when the axis argument was not supported and now raises a ``ValueError`` instead (:issue:`39211`) - diff --git a/pandas/_libs/hashtable_func_helper.pxi.in b/pandas/_libs/hashtable_func_helper.pxi.in index b4da5a3c7fb09..4684ecb8716c0 100644 --- a/pandas/_libs/hashtable_func_helper.pxi.in +++ b/pandas/_libs/hashtable_func_helper.pxi.in @@ -28,52 +28,6 @@ dtypes = [('Complex128', 'complex128', 'complex128', {{for name, dtype, ttype, c_type, to_c_type in dtypes}} -@cython.wraparound(False) -@cython.boundscheck(False) -{{if dtype == 'object'}} -cdef build_count_table_{{dtype}}(ndarray[{{dtype}}] values, - kh_{{ttype}}_t *table, bint dropna): -{{else}} -cdef build_count_table_{{dtype}}(const {{dtype}}_t[:] values, - kh_{{ttype}}_t *table, bint dropna): -{{endif}} - cdef: - khiter_t k - Py_ssize_t i, n = len(values) - - {{c_type}} val - - int ret = 0 - - {{if dtype == 'object'}} - kh_resize_{{ttype}}(table, n // 10) - - for i in range(n): - val = values[i] - if not checknull(val) or not dropna: - k = kh_get_{{ttype}}(table, val) - if k != table.n_buckets: - table.vals[k] += 1 - else: - k = kh_put_{{ttype}}(table, val, &ret) - table.vals[k] = 1 - {{else}} - with nogil: - kh_resize_{{ttype}}(table, n) - - for i in range(n): - val = {{to_c_type}}(values[i]) - - if not is_nan_{{c_type}}(val) or not dropna: - k = kh_get_{{ttype}}(table, val) - if k != table.n_buckets: - table.vals[k] += 1 - else: - k = kh_put_{{ttype}}(table, val, &ret) - table.vals[k] = 1 - {{endif}} - - @cython.wraparound(False) @cython.boundscheck(False) {{if dtype == 'object'}} @@ -84,8 +38,6 @@ cpdef value_count_{{dtype}}(const {{dtype}}_t[:] values, bint dropna): cdef: Py_ssize_t i = 0 Py_ssize_t n = len(values) - size_t unique_key_index = 0 - size_t unique_key_count = 0 kh_{{ttype}}_t *table # Don't use Py_ssize_t, since table.n_buckets is unsigned @@ -98,12 +50,10 @@ cpdef value_count_{{dtype}}(const {{dtype}}_t[:] values, bint dropna): # we track the order in which keys are first seen (GH39009), # khash-map isn't insertion-ordered, thus: - # table maps key to index_of_appearence - # result_keys maps index_of_appearence to key - # result_counts maps index_of_appearence to number of elements + # table maps keys to counts + # result_keys remembers the original order of keys result_keys = {{name}}Vector() - result_counts = Int64Vector() table = kh_init_{{ttype}}() {{if dtype == 'object'}} @@ -118,14 +68,11 @@ cpdef value_count_{{dtype}}(const {{dtype}}_t[:] values, bint dropna): val = navalue k = kh_get_{{ttype}}(table, val) if k != table.n_buckets: - unique_key_index = table.vals[k] - result_counts.data.data[unique_key_index] += 1 + table.vals[k] += 1 else: k = kh_put_{{ttype}}(table, val, &ret) - table.vals[k] = unique_key_count + table.vals[k] = 1 result_keys.append(val) - result_counts.append(1) - unique_key_count+=1 {{else}} kh_resize_{{ttype}}(table, n) @@ -135,19 +82,26 @@ cpdef value_count_{{dtype}}(const {{dtype}}_t[:] values, bint dropna): if not is_nan_{{c_type}}(val) or not dropna: k = kh_get_{{ttype}}(table, val) if k != table.n_buckets: - unique_key_index = table.vals[k] - result_counts.data.data[unique_key_index] += 1 + table.vals[k] += 1 else: k = kh_put_{{ttype}}(table, val, &ret) - table.vals[k] = unique_key_count + table.vals[k] = 1 result_keys.append(val) - result_counts.append(1) - unique_key_count+=1 {{endif}} + # collect counts in the order corresponding to result_keys: + cdef int64_t[:] result_counts = np.empty(table.size, dtype=np.int64) + for i in range(table.size): + {{if dtype == 'object'}} + k = kh_get_{{ttype}}(table, result_keys.data[i]) + {{else}} + k = kh_get_{{ttype}}(table, result_keys.data.data[i]) + {{endif}} + result_counts[i] = table.vals[k] + kh_destroy_{{ttype}}(table) - return result_keys.to_array(), result_counts.to_array() + return result_keys.to_array(), result_counts.base @cython.wraparound(False) @@ -294,78 +248,42 @@ def ismember_{{dtype}}(const {{dtype}}_t[:] arr, const {{dtype}}_t[:] values): kh_destroy_{{ttype}}(table) return result.view(np.bool_) -{{endfor}} - - # ---------------------------------------------------------------------- # Mode Computations # ---------------------------------------------------------------------- -{{py: - -# dtype, ctype, table_type, npy_dtype -dtypes = [('complex128', 'khcomplex128_t', 'complex128', 'complex128'), - ('complex64', 'khcomplex64_t', 'complex64', 'complex64'), - ('float64', 'float64_t', 'float64', 'float64'), - ('float32', 'float32_t', 'float32', 'float32'), - ('int64', 'int64_t', 'int64', 'int64'), - ('int32', 'int32_t', 'int32', 'int32'), - ('int16', 'int16_t', 'int16', 'int16'), - ('int8', 'int8_t', 'int8', 'int8'), - ('uint64', 'uint64_t', 'uint64', 'uint64'), - ('uint32', 'uint32_t', 'uint32', 'uint32'), - ('uint16', 'uint16_t', 'uint16', 'uint16'), - ('uint8', 'uint8_t', 'uint8', 'uint8'), - ('object', 'object', 'pymap', 'object_')] -}} - -{{for dtype, ctype, table_type, npy_dtype in dtypes}} - @cython.wraparound(False) @cython.boundscheck(False) - {{if dtype == 'object'}} - - -def mode_{{dtype}}(ndarray[{{ctype}}] values, bint dropna): +def mode_{{dtype}}(ndarray[{{dtype}}] values, bint dropna): {{else}} - - def mode_{{dtype}}(const {{dtype}}_t[:] values, bint dropna): {{endif}} cdef: - int count, max_count = 1 - int j = -1 # so you can do += - # Don't use Py_ssize_t, since table.n_buckets is unsigned - khiter_t k - kh_{{table_type}}_t *table - ndarray[{{ctype}}] modes + {{if dtype == 'object'}} + ndarray[{{dtype}}] keys + ndarray[{{dtype}}] modes + {{else}} + {{dtype}}_t[:] keys + ndarray[{{dtype}}_t] modes + {{endif}} + int64_t[:] counts + int64_t count, max_count = -1 + Py_ssize_t k, j = 0 - table = kh_init_{{table_type}}() - build_count_table_{{dtype}}(values, table, dropna) + keys, counts = value_count_{{dtype}}(values, dropna) - modes = np.empty(table.n_buckets, dtype=np.{{npy_dtype}}) + {{if dtype == 'object'}} + modes = np.empty(len(keys), dtype=np.object_) + {{else}} + modes = np.empty(len(keys), dtype=np.{{dtype}}) + {{endif}} {{if dtype != 'object'}} with nogil: - for k in range(table.n_buckets): - if kh_exist_{{table_type}}(table, k): - count = table.vals[k] - if count == max_count: - j += 1 - elif count > max_count: - max_count = count - j = 0 - else: - continue - - modes[j] = table.keys[k] - {{else}} - for k in range(table.n_buckets): - if kh_exist_{{table_type}}(table, k): - count = table.vals[k] - + for k in range(len(keys)): + count = counts[k] if count == max_count: j += 1 elif count > max_count: @@ -374,11 +292,21 @@ def mode_{{dtype}}(const {{dtype}}_t[:] values, bint dropna): else: continue - modes[j] = table.keys[k] + modes[j] = keys[k] + {{else}} + for k in range(len(keys)): + count = counts[k] + if count == max_count: + j += 1 + elif count > max_count: + max_count = count + j = 0 + else: + continue + + modes[j] = keys[k] {{endif}} - kh_destroy_{{table_type}}(table) - return modes[:j + 1] {{endfor}} diff --git a/pandas/tests/libs/test_hashtable.py b/pandas/tests/libs/test_hashtable.py index 5bf652c206a5f..8393312004241 100644 --- a/pandas/tests/libs/test_hashtable.py +++ b/pandas/tests/libs/test_hashtable.py @@ -6,6 +6,7 @@ from pandas._libs import hashtable as ht +import pandas as pd import pandas._testing as tm @@ -323,6 +324,23 @@ def test_mode(self, dtype, type_suffix, writable): result = mode(values, False) assert result == 42 + def test_mode_stable(self, dtype, type_suffix, writable): + mode = get_ht_function("mode", type_suffix) + values = np.array([2, 1, 5, 22, 3, -1, 8]).astype(dtype) + values.flags.writeable = writable + keys = mode(values, False) + tm.assert_numpy_array_equal(keys, values) + + +def test_modes_with_nans(): + # GH39007 + values = np.array([True, pd.NA, np.nan], dtype=np.object_) + # pd.Na and np.nan will have the same representative: np.nan + # thus we have 2 nans and 1 True + modes = ht.mode_object(values, False) + assert modes.size == 1 + assert np.isnan(modes[0]) + @pytest.mark.parametrize( "dtype, type_suffix",