From 18050cb897e75ed430ddb8440d061244f81f08ad Mon Sep 17 00:00:00 2001 From: Egor Dranischnikow Date: Tue, 5 Jan 2021 01:04:19 +0100 Subject: [PATCH 01/14] introducing stable value_counts function --- pandas/_libs/hashtable_func_helper.pxi.in | 85 +++++++++++++++-------- 1 file changed, 55 insertions(+), 30 deletions(-) diff --git a/pandas/_libs/hashtable_func_helper.pxi.in b/pandas/_libs/hashtable_func_helper.pxi.in index f8f541235dcb7..0335acb86591b 100644 --- a/pandas/_libs/hashtable_func_helper.pxi.in +++ b/pandas/_libs/hashtable_func_helper.pxi.in @@ -77,54 +77,79 @@ cdef build_count_table_{{dtype}}(const {{dtype}}_t[:] values, @cython.wraparound(False) @cython.boundscheck(False) {{if dtype == 'object'}} -cpdef value_count_{{dtype}}(ndarray[{{dtype}}] values, bint dropna): +cpdef stable_value_count_{{dtype}}(ndarray[{{dtype}}] values, bint dropna): {{else}} -cpdef value_count_{{dtype}}(const {{dtype}}_t[:] values, bint dropna): +cpdef stable_value_count_{{dtype}}(const {{dtype}}_t[:] values, bint dropna): {{endif}} cdef: Py_ssize_t i = 0 + Py_ssize_t n = len(values) + size_t unique_key_index = 0 + size_t unique_key_count = 0 kh_{{ttype}}_t *table - {{if dtype != 'object'}} - {{dtype}}_t[:] result_keys - int64_t[:] result_counts - {{endif}} - # Don't use Py_ssize_t, since table.n_buckets is unsigned khiter_t k - table = kh_init_{{ttype}}() - {{if dtype == 'object'}} - build_count_table_{{dtype}}(values, table, 1) + {{c_type}} val + + int ret = 0 + + {{if dtype[0]!='u'}} + result_keys = {{dtype.title()}}Vector() {{else}} - build_count_table_{{dtype}}(values, table, dropna) + result_keys = {{'U'+dtype[1::].title()}}Vector() {{endif}} - - result_keys = np.empty(table.n_occupied, '{{dtype}}') - result_counts = np.zeros(table.n_occupied, dtype=np.int64) + result_counts = Int64Vector() + table = kh_init_{{ttype}}() {{if dtype == 'object'}} - for k in range(table.n_buckets): - if kh_exist_{{ttype}}(table, k): - result_keys[i] = <{{dtype}}>table.keys[k] - result_counts[i] = table.vals[k] - i += 1 + kh_resize_{{ttype}}(table, n // 10) + + for i in range(n): + val = values[i] + if not checknull(val) or not dropna: + k = kh_get_{{ttype}}(table, val) + if k != table.n_buckets: + unique_key_index = table.vals[k] + result_counts.data.data[unique_key_index] += 1 + else: + k = kh_put_{{ttype}}(table, val, &ret) + table.vals[k] = unique_key_count + result_keys.append(val) + result_counts.append(1) + unique_key_count+=1 {{else}} - with nogil: - for k in range(table.n_buckets): - if kh_exist_{{ttype}}(table, k): - result_keys[i] = {{to_dtype}}(table.keys[k]) - result_counts[i] = table.vals[k] - i += 1 + kh_resize_{{ttype}}(table, n) + + for i in range(n): + val = {{to_c_type}}(values[i]) + + if not is_nan_{{c_type}}(val) or not dropna: + k = kh_get_{{ttype}}(table, val) + if k != table.n_buckets: + unique_key_index = table.vals[k] + result_counts.data.data[unique_key_index] += 1 + else: + k = kh_put_{{ttype}}(table, val, &ret) + table.vals[k] = unique_key_count + result_keys.append(val) + result_counts.append(1) + unique_key_count+=1 {{endif}} kh_destroy_{{ttype}}(table) - {{if dtype == 'object'}} - return result_keys, result_counts - {{else}} - return np.asarray(result_keys), np.asarray(result_counts) - {{endif}} + return result_keys.to_array(), result_counts.to_array() + + +{{if dtype == 'object'}} +cpdef value_count_{{dtype}}(ndarray[{{dtype}}] values, bint dropna): + return stable_value_count_{{dtype}}(values, 1) +{{else}} +cpdef value_count_{{dtype}}(const {{dtype}}_t[:] values, bint dropna): + return stable_value_count_{{dtype}}(values, dropna) +{{endif}} @cython.wraparound(False) From bfa7e570e4db2ec85dd6ac8070f661329136186e Mon Sep 17 00:00:00 2001 From: Egor Dranischnikow Date: Tue, 5 Jan 2021 23:17:05 +0100 Subject: [PATCH 02/14] cleaning up: remove no longer needed functions --- pandas/_libs/hashtable_class_helper.pxi.in | 7 ----- pandas/_libs/hashtable_func_helper.pxi.in | 30 +++++++++++----------- 2 files changed, 15 insertions(+), 22 deletions(-) diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index 276f162545399..a3e72ed858392 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -19,13 +19,6 @@ cdef kh{{name}}_t to_kh{{name}}_t({{name}}_t val) nogil: res.imag = val.imag return res - -cdef {{name}}_t to_{{name}}(kh{{name}}_t val) nogil: - cdef {{name}}_t res - res.real = val.real - res.imag = val.imag - return res - {{endfor}} diff --git a/pandas/_libs/hashtable_func_helper.pxi.in b/pandas/_libs/hashtable_func_helper.pxi.in index 0335acb86591b..9c4c4fea22c5b 100644 --- a/pandas/_libs/hashtable_func_helper.pxi.in +++ b/pandas/_libs/hashtable_func_helper.pxi.in @@ -6,26 +6,26 @@ WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in {{py: -# dtype, ttype, c_type, to_c_type, to_dtype +# dtype, ttype, c_type, to_c_type dtypes = [('complex128', 'complex128', 'khcomplex128_t', - 'to_khcomplex128_t', 'to_complex128'), + 'to_khcomplex128_t'), ('complex64', 'complex64', 'khcomplex64_t', - 'to_khcomplex64_t', 'to_complex64'), - ('float64', 'float64', 'float64_t', '', ''), - ('float32', 'float32', 'float32_t', '', ''), - ('uint64', 'uint64', 'uint64_t', '', ''), - ('uint32', 'uint32', 'uint32_t', '', ''), - ('uint16', 'uint16', 'uint16_t', '', ''), - ('uint8', 'uint8', 'uint8_t', '', ''), - ('object', 'pymap', 'object', '', ''), - ('int64', 'int64', 'int64_t', '', ''), - ('int32', 'int32', 'int32_t', '', ''), - ('int16', 'int16', 'int16_t', '', ''), - ('int8', 'int8', 'int8_t', '', '')] + 'to_khcomplex64_t'), + ('float64', 'float64', 'float64_t', ''), + ('float32', 'float32', 'float32_t', ''), + ('uint64', 'uint64', 'uint64_t', ''), + ('uint32', 'uint32', 'uint32_t', ''), + ('uint16', 'uint16', 'uint16_t', ''), + ('uint8', 'uint8', 'uint8_t', ''), + ('object', 'pymap', 'object', ''), + ('int64', 'int64', 'int64_t', ''), + ('int32', 'int32', 'int32_t', ''), + ('int16', 'int16', 'int16_t', ''), + ('int8', 'int8', 'int8_t', '')] }} -{{for dtype, ttype, c_type, to_c_type, to_dtype in dtypes}} +{{for dtype, ttype, c_type, to_c_type in dtypes}} @cython.wraparound(False) From 773774b0dae2fa385b3a8e28b053be284f2f78c7 Mon Sep 17 00:00:00 2001 From: Egor Dranischnikow Date: Tue, 5 Jan 2021 23:23:11 +0100 Subject: [PATCH 03/14] the order of keys is stable now (the same as in the input), fix test cases --- pandas/tests/arrays/boolean/test_function.py | 6 +++--- pandas/tests/arrays/test_datetimes.py | 2 +- pandas/tests/series/methods/test_value_counts.py | 2 +- pandas/tests/test_algos.py | 8 +++----- 4 files changed, 8 insertions(+), 10 deletions(-) diff --git a/pandas/tests/arrays/boolean/test_function.py b/pandas/tests/arrays/boolean/test_function.py index 0f8743489b412..d90655b6e2820 100644 --- a/pandas/tests/arrays/boolean/test_function.py +++ b/pandas/tests/arrays/boolean/test_function.py @@ -77,18 +77,18 @@ def test_ufunc_reduce_raises(values): def test_value_counts_na(): arr = pd.array([True, False, pd.NA], dtype="boolean") result = arr.value_counts(dropna=False) - expected = pd.Series([1, 1, 1], index=[False, True, pd.NA], dtype="Int64") + expected = pd.Series([1, 1, 1], index=[True, False, pd.NA], dtype="Int64") tm.assert_series_equal(result, expected) result = arr.value_counts(dropna=True) - expected = pd.Series([1, 1], index=[False, True], dtype="Int64") + expected = pd.Series([1, 1], index=[True, False], dtype="Int64") tm.assert_series_equal(result, expected) def test_value_counts_with_normalize(): s = pd.Series([True, False, pd.NA], dtype="boolean") result = s.value_counts(normalize=True) - expected = pd.Series([1, 1], index=[False, True], dtype="Float64") / 2 + expected = pd.Series([1, 1], index=[True, False], dtype="Float64") / 2 tm.assert_series_equal(result, expected) diff --git a/pandas/tests/arrays/test_datetimes.py b/pandas/tests/arrays/test_datetimes.py index ea44e5d477fc6..587d3c466c631 100644 --- a/pandas/tests/arrays/test_datetimes.py +++ b/pandas/tests/arrays/test_datetimes.py @@ -288,7 +288,7 @@ def test_value_counts_preserves_tz(self): arr[-2] = pd.NaT result = arr.value_counts() - expected = pd.Series([1, 4, 2], index=[pd.NaT, dti[0], dti[1]]) + expected = pd.Series([4, 2, 1], index=[dti[0], dti[1], pd.NaT]) tm.assert_series_equal(result, expected) @pytest.mark.parametrize("method", ["pad", "backfill"]) diff --git a/pandas/tests/series/methods/test_value_counts.py b/pandas/tests/series/methods/test_value_counts.py index f22b1be672190..2bed9940d1ebd 100644 --- a/pandas/tests/series/methods/test_value_counts.py +++ b/pandas/tests/series/methods/test_value_counts.py @@ -195,7 +195,7 @@ def test_value_counts_categorical_with_nan(self): ( Series(range(3), index=[True, False, np.nan]).index, False, - Series([1, 1, 1], index=[pd.NA, False, True]), + Series([1, 1, 1], index=[np.nan, True, False]), ), ], ) diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index fb982c02acd99..88757b96085aa 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -6,7 +6,7 @@ import pytest from pandas._libs import algos as libalgos, hashtable as ht -from pandas.compat import IS64, np_array_datetime64_compat +from pandas.compat import np_array_datetime64_compat import pandas.util._test_decorators as td from pandas.core.dtypes.common import ( @@ -1272,12 +1272,10 @@ def test_value_counts_uint64(self): tm.assert_series_equal(result, expected) arr = np.array([-1, 2 ** 63], dtype=object) - expected = Series([1, 1], index=[2 ** 63, -1]) + expected = Series([1, 1], index=[-1, 2 ** 63]) result = algos.value_counts(arr) - # 32-bit linux has a different ordering - if IS64: - tm.assert_series_equal(result, expected) + tm.assert_series_equal(result, expected) class TestDuplicated: From d96118f89a42def6ebebc607a06633fdc798ad2d Mon Sep 17 00:00:00 2001 From: Egor Dranischnikow Date: Tue, 5 Jan 2021 23:28:42 +0100 Subject: [PATCH 04/14] adding a test testing explicitly for stability --- pandas/tests/libs/test_hashtable.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/pandas/tests/libs/test_hashtable.py b/pandas/tests/libs/test_hashtable.py index 4f650807afd30..ec01009b23954 100644 --- a/pandas/tests/libs/test_hashtable.py +++ b/pandas/tests/libs/test_hashtable.py @@ -272,6 +272,14 @@ def test_value_count(self, dtype, type_suffix, writable): tm.assert_numpy_array_equal(np.sort(keys), expected) assert np.all(counts == 5) + def test_value_count_stable(self, dtype, type_suffix, writable): + value_count = get_ht_function("value_count", type_suffix) + values = np.array([2, 1, 5, 22, 3, -1, 8]).astype(dtype) + values.flags.writeable = writable + keys, counts = value_count(values, False) + tm.assert_numpy_array_equal(keys, values) + assert np.all(counts == 1) + def test_duplicated_first(self, dtype, type_suffix, writable): N = 100 duplicated = get_ht_function("duplicated", type_suffix) From ca68f30cc19a3f83c685d68084ad7aa8f6a4b187 Mon Sep 17 00:00:00 2001 From: Egor Dranischnikow Date: Wed, 6 Jan 2021 00:11:58 +0100 Subject: [PATCH 05/14] fixing docstring tests --- pandas/core/base.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/core/base.py b/pandas/core/base.py index 631f67ced77dd..09e2e80f45b3d 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -1004,9 +1004,9 @@ def value_counts( >>> index = pd.Index([3, 1, 2, 3, 4, np.nan]) >>> index.value_counts() 3.0 2 + 1.0 1 2.0 1 4.0 1 - 1.0 1 dtype: int64 With `normalize` set to `True`, returns the relative frequency by @@ -1015,9 +1015,9 @@ def value_counts( >>> s = pd.Series([3, 1, 2, 3, 4, np.nan]) >>> s.value_counts(normalize=True) 3.0 0.4 + 1.0 0.2 2.0 0.2 4.0 0.2 - 1.0 0.2 dtype: float64 **bins** @@ -1039,10 +1039,10 @@ def value_counts( >>> s.value_counts(dropna=False) 3.0 2 + 1.0 1 2.0 1 - NaN 1 4.0 1 - 1.0 1 + NaN 1 dtype: int64 """ return value_counts( From 7fd91530b616a13645ab0e4b74912b81d2bbf902 Mon Sep 17 00:00:00 2001 From: Egor Dranischnikow Date: Fri, 8 Jan 2021 22:14:22 +0100 Subject: [PATCH 06/14] keep original order also for null-objects for dtype=np.object --- pandas/_libs/hashtable_func_helper.pxi.in | 19 +++++++------------ pandas/core/algorithms.py | 5 ----- 2 files changed, 7 insertions(+), 17 deletions(-) diff --git a/pandas/_libs/hashtable_func_helper.pxi.in b/pandas/_libs/hashtable_func_helper.pxi.in index 9c4c4fea22c5b..31e67b8bd57a4 100644 --- a/pandas/_libs/hashtable_func_helper.pxi.in +++ b/pandas/_libs/hashtable_func_helper.pxi.in @@ -77,9 +77,9 @@ cdef build_count_table_{{dtype}}(const {{dtype}}_t[:] values, @cython.wraparound(False) @cython.boundscheck(False) {{if dtype == 'object'}} -cpdef stable_value_count_{{dtype}}(ndarray[{{dtype}}] values, bint dropna): +cpdef value_count_{{dtype}}(ndarray[{{dtype}}] values, bint dropna, navalue=np.NaN): {{else}} -cpdef stable_value_count_{{dtype}}(const {{dtype}}_t[:] values, bint dropna): +cpdef value_count_{{dtype}}(const {{dtype}}_t[:] values, bint dropna): {{endif}} cdef: Py_ssize_t i = 0 @@ -90,6 +90,7 @@ cpdef stable_value_count_{{dtype}}(const {{dtype}}_t[:] values, bint dropna): # Don't use Py_ssize_t, since table.n_buckets is unsigned khiter_t k + bint is_null {{c_type}} val @@ -108,7 +109,10 @@ cpdef stable_value_count_{{dtype}}(const {{dtype}}_t[:] values, bint dropna): for i in range(n): val = values[i] - if not checknull(val) or not dropna: + is_null = checknull(val) + if is_null: + val = navalue + if not is_null or not dropna: k = kh_get_{{ttype}}(table, val) if k != table.n_buckets: unique_key_index = table.vals[k] @@ -143,15 +147,6 @@ cpdef stable_value_count_{{dtype}}(const {{dtype}}_t[:] values, bint dropna): return result_keys.to_array(), result_counts.to_array() -{{if dtype == 'object'}} -cpdef value_count_{{dtype}}(ndarray[{{dtype}}] values, bint dropna): - return stable_value_count_{{dtype}}(values, 1) -{{else}} -cpdef value_count_{{dtype}}(const {{dtype}}_t[:] values, bint dropna): - return stable_value_count_{{dtype}}(values, dropna) -{{endif}} - - @cython.wraparound(False) @cython.boundscheck(False) {{if dtype == 'object'}} diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index ed7ae75117c5c..968b39088e684 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -866,11 +866,6 @@ def value_counts_arraylike(values, dropna: bool): f = getattr(htable, f"value_count_{ndtype}") keys, counts = f(values, dropna) - mask = isna(values) - if not dropna and mask.any() and not isna(keys).any(): - keys = np.insert(keys, 0, np.NaN) - counts = np.insert(counts, 0, mask.sum()) - keys = _reconstruct_data(keys, original.dtype, original) return keys, counts From 9962c3a1c0c2b38dff57b22044715ca6746f0cb2 Mon Sep 17 00:00:00 2001 From: Egor Dranischnikow Date: Fri, 8 Jan 2021 22:20:15 +0100 Subject: [PATCH 07/14] fixing test cases where the position of NaN has changed --- pandas/tests/arrays/string_/test_string.py | 2 +- pandas/tests/series/methods/test_value_counts.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index 5365929213503..d14de990d8268 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -497,7 +497,7 @@ def test_value_counts_na(dtype, request): arr = pd.array(["a", "b", "a", pd.NA], dtype=dtype) result = arr.value_counts(dropna=False) - expected = pd.Series([2, 1, 1], index=["a", pd.NA, "b"], dtype="Int64") + expected = pd.Series([2, 1, 1], index=["a", "b", pd.NA], dtype="Int64") tm.assert_series_equal(result, expected) result = arr.value_counts(dropna=True) diff --git a/pandas/tests/series/methods/test_value_counts.py b/pandas/tests/series/methods/test_value_counts.py index 2bed9940d1ebd..505b879660ff1 100644 --- a/pandas/tests/series/methods/test_value_counts.py +++ b/pandas/tests/series/methods/test_value_counts.py @@ -185,7 +185,7 @@ def test_value_counts_categorical_with_nan(self): ( Series([False, True, True, pd.NA]), False, - Series([2, 1, 1], index=[True, pd.NA, False]), + Series([2, 1, 1], index=[True, False, pd.NA]), ), ( Series([False, True, True, pd.NA]), @@ -195,7 +195,7 @@ def test_value_counts_categorical_with_nan(self): ( Series(range(3), index=[True, False, np.nan]).index, False, - Series([1, 1, 1], index=[np.nan, True, False]), + Series([1, 1, 1], index=[True, False, np.nan]), ), ], ) From 679044ba2437c8ff2595182347a45381fb371406 Mon Sep 17 00:00:00 2001 From: Egor Dranischnikow Date: Thu, 14 Jan 2021 22:21:31 +0100 Subject: [PATCH 08/14] introduce name rather than creating XXXVector-names --- pandas/_libs/hashtable_func_helper.pxi.in | 40 ++++++++++------------- 1 file changed, 18 insertions(+), 22 deletions(-) diff --git a/pandas/_libs/hashtable_func_helper.pxi.in b/pandas/_libs/hashtable_func_helper.pxi.in index 31e67b8bd57a4..aead3a2e97dd4 100644 --- a/pandas/_libs/hashtable_func_helper.pxi.in +++ b/pandas/_libs/hashtable_func_helper.pxi.in @@ -6,26 +6,26 @@ WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in {{py: -# dtype, ttype, c_type, to_c_type -dtypes = [('complex128', 'complex128', 'khcomplex128_t', - 'to_khcomplex128_t'), - ('complex64', 'complex64', 'khcomplex64_t', - 'to_khcomplex64_t'), - ('float64', 'float64', 'float64_t', ''), - ('float32', 'float32', 'float32_t', ''), - ('uint64', 'uint64', 'uint64_t', ''), - ('uint32', 'uint32', 'uint32_t', ''), - ('uint16', 'uint16', 'uint16_t', ''), - ('uint8', 'uint8', 'uint8_t', ''), - ('object', 'pymap', 'object', ''), - ('int64', 'int64', 'int64_t', ''), - ('int32', 'int32', 'int32_t', ''), - ('int16', 'int16', 'int16_t', ''), - ('int8', 'int8', 'int8_t', '')] +# name, dtype, ttype, c_type, to_c_type +dtypes = [('Complex128', 'complex128', 'complex128', + 'khcomplex128_t', 'to_khcomplex128_t'), + ('Complex64', 'complex64', 'complex64', + 'khcomplex64_t', 'to_khcomplex64_t'), + ('Float64', 'float64', 'float64', 'float64_t', ''), + ('Float32', 'float32', 'float32', 'float32_t', ''), + ('UInt64', 'uint64', 'uint64', 'uint64_t', ''), + ('UInt32', 'uint32', 'uint32', 'uint32_t', ''), + ('UInt16', 'uint16', 'uint16', 'uint16_t', ''), + ('UInt8', 'uint8', 'uint8', 'uint8_t', ''), + ('Object', 'object', 'pymap', 'object', ''), + ('Int64', 'int64', 'int64', 'int64_t', ''), + ('Int32', 'int32', 'int32', 'int32_t', ''), + ('Int16', 'int16', 'int16', 'int16_t', ''), + ('Int8', 'int8', 'int8', 'int8_t', '')] }} -{{for dtype, ttype, c_type, to_c_type in dtypes}} +{{for name, dtype, ttype, c_type, to_c_type in dtypes}} @cython.wraparound(False) @@ -96,11 +96,7 @@ cpdef value_count_{{dtype}}(const {{dtype}}_t[:] values, bint dropna): int ret = 0 - {{if dtype[0]!='u'}} - result_keys = {{dtype.title()}}Vector() - {{else}} - result_keys = {{'U'+dtype[1::].title()}}Vector() - {{endif}} + result_keys = {{name}}Vector() result_counts = Int64Vector() table = kh_init_{{ttype}}() From 8097e5b73300c2eadb6a7df9ba9a4b9c8ea3b29c Mon Sep 17 00:00:00 2001 From: Egor Dranischnikow Date: Thu, 14 Jan 2021 22:30:22 +0100 Subject: [PATCH 09/14] adding explanation --- pandas/_libs/hashtable_func_helper.pxi.in | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/pandas/_libs/hashtable_func_helper.pxi.in b/pandas/_libs/hashtable_func_helper.pxi.in index aead3a2e97dd4..b236a8db5f63f 100644 --- a/pandas/_libs/hashtable_func_helper.pxi.in +++ b/pandas/_libs/hashtable_func_helper.pxi.in @@ -96,6 +96,12 @@ cpdef value_count_{{dtype}}(const {{dtype}}_t[:] values, bint dropna): int ret = 0 + # we track the order in which keys are first seen (GH39009), + # khash-map isn't insertion-ordered, thus: + # table maps key to index_of_appearence + # result_keys maps index_of_appearence to key + # result_counts maps index_of_appearence to number of elements + result_keys = {{name}}Vector() result_counts = Int64Vector() table = kh_init_{{ttype}}() From 95f2247cc04a20db42dd5d47c375dbd46358b99b Mon Sep 17 00:00:00 2001 From: Egor Dranischnikow Date: Thu, 14 Jan 2021 22:37:12 +0100 Subject: [PATCH 10/14] adding issue-id to comment --- pandas/tests/libs/test_hashtable.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/tests/libs/test_hashtable.py b/pandas/tests/libs/test_hashtable.py index ec01009b23954..5bf652c206a5f 100644 --- a/pandas/tests/libs/test_hashtable.py +++ b/pandas/tests/libs/test_hashtable.py @@ -273,6 +273,7 @@ def test_value_count(self, dtype, type_suffix, writable): assert np.all(counts == 5) def test_value_count_stable(self, dtype, type_suffix, writable): + # GH12679 value_count = get_ht_function("value_count", type_suffix) values = np.array([2, 1, 5, 22, 3, -1, 8]).astype(dtype) values.flags.writeable = writable From d480b7c8777f29862c92dc95f4f049788eb6f587 Mon Sep 17 00:00:00 2001 From: Egor Dranischnikow Date: Thu, 14 Jan 2021 22:40:57 +0100 Subject: [PATCH 11/14] adding whatsnew --- doc/source/whatsnew/v1.3.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index ff11ebc022ffb..af39e93e3b3d2 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -356,6 +356,7 @@ Reshaping - Bug in :func:`join` over :class:`MultiIndex` returned wrong result, when one of both indexes had only one level (:issue:`36909`) - :meth:`merge_asof` raises ``ValueError`` instead of cryptic ``TypeError`` in case of non-numerical merge columns (:issue:`29130`) - Bug in :meth:`DataFrame.join` not assigning values correctly when having :class:`MultiIndex` where at least one dimension is from dtype ``Categorical`` with non-alphabetically sorted categories (:issue:`38502`) +- ``value_counts`` returns keys in original order (:issue:`12679`) - Bug in :meth:`DataFrame.apply` would give incorrect results when used with a string argument and ``axis=1`` when the axis argument was not supported and now raises a ``ValueError`` instead (:issue:`39211`) - From 90d0c523f464b3b72bfc7cf25102ec099a85035e Mon Sep 17 00:00:00 2001 From: Egor Dranischnikow Date: Fri, 22 Jan 2021 07:14:56 +0100 Subject: [PATCH 12/14] fixing a newly created test --- pandas/tests/frame/methods/test_describe.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/frame/methods/test_describe.py b/pandas/tests/frame/methods/test_describe.py index ed1c3fcce378c..15bafb7a835ba 100644 --- a/pandas/tests/frame/methods/test_describe.py +++ b/pandas/tests/frame/methods/test_describe.py @@ -371,7 +371,7 @@ def test_describe_does_not_raise_error_for_dictlike_elements(self): # GH#32409 df = DataFrame([{"test": {"a": "1"}}, {"test": {"a": "2"}}]) expected = DataFrame( - {"test": [2, 2, {"a": "2"}, 1]}, index=["count", "unique", "top", "freq"] + {"test": [2, 2, {"a": "1"}, 1]}, index=["count", "unique", "top", "freq"] ) result = df.describe() tm.assert_frame_equal(result, expected) From d85b1aa47fed1da8d92b3843bb2736816815aa9d Mon Sep 17 00:00:00 2001 From: Egor Dranischnikow Date: Fri, 22 Jan 2021 15:56:47 +0100 Subject: [PATCH 13/14] improve whatsnew --- doc/source/whatsnew/v1.3.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index af39e93e3b3d2..2234b870ac9c0 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -356,7 +356,7 @@ Reshaping - Bug in :func:`join` over :class:`MultiIndex` returned wrong result, when one of both indexes had only one level (:issue:`36909`) - :meth:`merge_asof` raises ``ValueError`` instead of cryptic ``TypeError`` in case of non-numerical merge columns (:issue:`29130`) - Bug in :meth:`DataFrame.join` not assigning values correctly when having :class:`MultiIndex` where at least one dimension is from dtype ``Categorical`` with non-alphabetically sorted categories (:issue:`38502`) -- ``value_counts`` returns keys in original order (:issue:`12679`) +- :meth:`Series.value_counts` returns keys in original order (:issue:`12679`, :issue:`11227`) - Bug in :meth:`DataFrame.apply` would give incorrect results when used with a string argument and ``axis=1`` when the axis argument was not supported and now raises a ``ValueError`` instead (:issue:`39211`) - From 230215fe6dccf5cbd2bee427eecf342139dfd037 Mon Sep 17 00:00:00 2001 From: Egor Dranischnikow Date: Fri, 22 Jan 2021 16:03:57 +0100 Subject: [PATCH 14/14] reorder ifs --- pandas/_libs/hashtable_func_helper.pxi.in | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pandas/_libs/hashtable_func_helper.pxi.in b/pandas/_libs/hashtable_func_helper.pxi.in index b236a8db5f63f..b4da5a3c7fb09 100644 --- a/pandas/_libs/hashtable_func_helper.pxi.in +++ b/pandas/_libs/hashtable_func_helper.pxi.in @@ -112,9 +112,10 @@ cpdef value_count_{{dtype}}(const {{dtype}}_t[:] values, bint dropna): for i in range(n): val = values[i] is_null = checknull(val) - if is_null: - val = navalue if not is_null or not dropna: + # all nas become the same representative: + if is_null: + val = navalue k = kh_get_{{ttype}}(table, val) if k != table.n_buckets: unique_key_index = table.vals[k]