diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index ff11ebc022ffb..2234b870ac9c0 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -356,6 +356,7 @@ Reshaping - Bug in :func:`join` over :class:`MultiIndex` returned wrong result, when one of both indexes had only one level (:issue:`36909`) - :meth:`merge_asof` raises ``ValueError`` instead of cryptic ``TypeError`` in case of non-numerical merge columns (:issue:`29130`) - Bug in :meth:`DataFrame.join` not assigning values correctly when having :class:`MultiIndex` where at least one dimension is from dtype ``Categorical`` with non-alphabetically sorted categories (:issue:`38502`) +- :meth:`Series.value_counts` returns keys in original order (:issue:`12679`, :issue:`11227`) - Bug in :meth:`DataFrame.apply` would give incorrect results when used with a string argument and ``axis=1`` when the axis argument was not supported and now raises a ``ValueError`` instead (:issue:`39211`) - diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index 276f162545399..a3e72ed858392 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -19,13 +19,6 @@ cdef kh{{name}}_t to_kh{{name}}_t({{name}}_t val) nogil: res.imag = val.imag return res - -cdef {{name}}_t to_{{name}}(kh{{name}}_t val) nogil: - cdef {{name}}_t res - res.real = val.real - res.imag = val.imag - return res - {{endfor}} diff --git a/pandas/_libs/hashtable_func_helper.pxi.in b/pandas/_libs/hashtable_func_helper.pxi.in index f8f541235dcb7..b4da5a3c7fb09 100644 --- a/pandas/_libs/hashtable_func_helper.pxi.in +++ b/pandas/_libs/hashtable_func_helper.pxi.in @@ -6,26 +6,26 @@ WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in {{py: -# dtype, ttype, c_type, to_c_type, to_dtype -dtypes = [('complex128', 'complex128', 'khcomplex128_t', - 'to_khcomplex128_t', 'to_complex128'), - ('complex64', 
'complex64', 'khcomplex64_t', - 'to_khcomplex64_t', 'to_complex64'), - ('float64', 'float64', 'float64_t', '', ''), - ('float32', 'float32', 'float32_t', '', ''), - ('uint64', 'uint64', 'uint64_t', '', ''), - ('uint32', 'uint32', 'uint32_t', '', ''), - ('uint16', 'uint16', 'uint16_t', '', ''), - ('uint8', 'uint8', 'uint8_t', '', ''), - ('object', 'pymap', 'object', '', ''), - ('int64', 'int64', 'int64_t', '', ''), - ('int32', 'int32', 'int32_t', '', ''), - ('int16', 'int16', 'int16_t', '', ''), - ('int8', 'int8', 'int8_t', '', '')] +# name, dtype, ttype, c_type, to_c_type +dtypes = [('Complex128', 'complex128', 'complex128', + 'khcomplex128_t', 'to_khcomplex128_t'), + ('Complex64', 'complex64', 'complex64', + 'khcomplex64_t', 'to_khcomplex64_t'), + ('Float64', 'float64', 'float64', 'float64_t', ''), + ('Float32', 'float32', 'float32', 'float32_t', ''), + ('UInt64', 'uint64', 'uint64', 'uint64_t', ''), + ('UInt32', 'uint32', 'uint32', 'uint32_t', ''), + ('UInt16', 'uint16', 'uint16', 'uint16_t', ''), + ('UInt8', 'uint8', 'uint8', 'uint8_t', ''), + ('Object', 'object', 'pymap', 'object', ''), + ('Int64', 'int64', 'int64', 'int64_t', ''), + ('Int32', 'int32', 'int32', 'int32_t', ''), + ('Int16', 'int16', 'int16', 'int16_t', ''), + ('Int8', 'int8', 'int8', 'int8_t', '')] }} -{{for dtype, ttype, c_type, to_c_type, to_dtype in dtypes}} +{{for name, dtype, ttype, c_type, to_c_type in dtypes}} @cython.wraparound(False) @@ -77,54 +77,77 @@ cdef build_count_table_{{dtype}}(const {{dtype}}_t[:] values, @cython.wraparound(False) @cython.boundscheck(False) {{if dtype == 'object'}} -cpdef value_count_{{dtype}}(ndarray[{{dtype}}] values, bint dropna): +cpdef value_count_{{dtype}}(ndarray[{{dtype}}] values, bint dropna, navalue=np.NaN): {{else}} cpdef value_count_{{dtype}}(const {{dtype}}_t[:] values, bint dropna): {{endif}} cdef: Py_ssize_t i = 0 + Py_ssize_t n = len(values) + size_t unique_key_index = 0 + size_t unique_key_count = 0 kh_{{ttype}}_t *table - {{if dtype != 
'object'}} - {{dtype}}_t[:] result_keys - int64_t[:] result_counts - {{endif}} - # Don't use Py_ssize_t, since table.n_buckets is unsigned khiter_t k + bint is_null + + {{c_type}} val + + int ret = 0 + + # we track the order in which keys are first seen (GH39009), + # khash-map isn't insertion-ordered, thus: + # table maps key to index_of_appearance + # result_keys maps index_of_appearance to key + # result_counts maps index_of_appearance to number of elements + result_keys = {{name}}Vector() + result_counts = Int64Vector() table = kh_init_{{ttype}}() + {{if dtype == 'object'}} - build_count_table_{{dtype}}(values, table, 1) + kh_resize_{{ttype}}(table, n // 10) + + for i in range(n): + val = values[i] + is_null = checknull(val) + if not is_null or not dropna: + # all nas become the same representative: + if is_null: + val = navalue + k = kh_get_{{ttype}}(table, val) + if k != table.n_buckets: + unique_key_index = table.vals[k] + result_counts.data.data[unique_key_index] += 1 + else: + k = kh_put_{{ttype}}(table, val, &ret) + table.vals[k] = unique_key_count + result_keys.append(val) + result_counts.append(1) + unique_key_count+=1 {{else}} - build_count_table_{{dtype}}(values, table, dropna) - {{endif}} + kh_resize_{{ttype}}(table, n) - result_keys = np.empty(table.n_occupied, '{{dtype}}') - result_counts = np.zeros(table.n_occupied, dtype=np.int64) + for i in range(n): + val = {{to_c_type}}(values[i]) - {{if dtype == 'object'}} - for k in range(table.n_buckets): - if kh_exist_{{ttype}}(table, k): - result_keys[i] = <{{dtype}}>table.keys[k] - result_counts[i] = table.vals[k] - i += 1 - {{else}} - with nogil: - for k in range(table.n_buckets): - if kh_exist_{{ttype}}(table, k): - result_keys[i] = {{to_dtype}}(table.keys[k]) - result_counts[i] = table.vals[k] - i += 1 + if not is_nan_{{c_type}}(val) or not dropna: + k = kh_get_{{ttype}}(table, val) + if k != table.n_buckets: + unique_key_index = table.vals[k] + result_counts.data.data[unique_key_index] += 1 + else: + 
k = kh_put_{{ttype}}(table, val, &ret) + table.vals[k] = unique_key_count + result_keys.append(val) + result_counts.append(1) + unique_key_count+=1 {{endif}} kh_destroy_{{ttype}}(table) - {{if dtype == 'object'}} - return result_keys, result_counts - {{else}} - return np.asarray(result_keys), np.asarray(result_counts) - {{endif}} + return result_keys.to_array(), result_counts.to_array() @cython.wraparound(False) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index ed7ae75117c5c..968b39088e684 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -866,11 +866,6 @@ def value_counts_arraylike(values, dropna: bool): f = getattr(htable, f"value_count_{ndtype}") keys, counts = f(values, dropna) - mask = isna(values) - if not dropna and mask.any() and not isna(keys).any(): - keys = np.insert(keys, 0, np.NaN) - counts = np.insert(counts, 0, mask.sum()) - keys = _reconstruct_data(keys, original.dtype, original) return keys, counts diff --git a/pandas/core/base.py b/pandas/core/base.py index 631f67ced77dd..09e2e80f45b3d 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -1004,9 +1004,9 @@ def value_counts( >>> index = pd.Index([3, 1, 2, 3, 4, np.nan]) >>> index.value_counts() 3.0 2 + 1.0 1 2.0 1 4.0 1 - 1.0 1 dtype: int64 With `normalize` set to `True`, returns the relative frequency by @@ -1015,9 +1015,9 @@ def value_counts( >>> s = pd.Series([3, 1, 2, 3, 4, np.nan]) >>> s.value_counts(normalize=True) 3.0 0.4 + 1.0 0.2 2.0 0.2 4.0 0.2 - 1.0 0.2 dtype: float64 **bins** @@ -1039,10 +1039,10 @@ def value_counts( >>> s.value_counts(dropna=False) 3.0 2 + 1.0 1 2.0 1 - NaN 1 4.0 1 - 1.0 1 + NaN 1 dtype: int64 """ return value_counts( diff --git a/pandas/tests/arrays/boolean/test_function.py b/pandas/tests/arrays/boolean/test_function.py index 0f8743489b412..d90655b6e2820 100644 --- a/pandas/tests/arrays/boolean/test_function.py +++ b/pandas/tests/arrays/boolean/test_function.py @@ -77,18 +77,18 @@ def 
test_ufunc_reduce_raises(values): def test_value_counts_na(): arr = pd.array([True, False, pd.NA], dtype="boolean") result = arr.value_counts(dropna=False) - expected = pd.Series([1, 1, 1], index=[False, True, pd.NA], dtype="Int64") + expected = pd.Series([1, 1, 1], index=[True, False, pd.NA], dtype="Int64") tm.assert_series_equal(result, expected) result = arr.value_counts(dropna=True) - expected = pd.Series([1, 1], index=[False, True], dtype="Int64") + expected = pd.Series([1, 1], index=[True, False], dtype="Int64") tm.assert_series_equal(result, expected) def test_value_counts_with_normalize(): s = pd.Series([True, False, pd.NA], dtype="boolean") result = s.value_counts(normalize=True) - expected = pd.Series([1, 1], index=[False, True], dtype="Float64") / 2 + expected = pd.Series([1, 1], index=[True, False], dtype="Float64") / 2 tm.assert_series_equal(result, expected) diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index 5365929213503..d14de990d8268 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -497,7 +497,7 @@ def test_value_counts_na(dtype, request): arr = pd.array(["a", "b", "a", pd.NA], dtype=dtype) result = arr.value_counts(dropna=False) - expected = pd.Series([2, 1, 1], index=["a", pd.NA, "b"], dtype="Int64") + expected = pd.Series([2, 1, 1], index=["a", "b", pd.NA], dtype="Int64") tm.assert_series_equal(result, expected) result = arr.value_counts(dropna=True) diff --git a/pandas/tests/arrays/test_datetimes.py b/pandas/tests/arrays/test_datetimes.py index ea44e5d477fc6..587d3c466c631 100644 --- a/pandas/tests/arrays/test_datetimes.py +++ b/pandas/tests/arrays/test_datetimes.py @@ -288,7 +288,7 @@ def test_value_counts_preserves_tz(self): arr[-2] = pd.NaT result = arr.value_counts() - expected = pd.Series([1, 4, 2], index=[pd.NaT, dti[0], dti[1]]) + expected = pd.Series([4, 2, 1], index=[dti[0], dti[1], pd.NaT]) tm.assert_series_equal(result, 
expected) @pytest.mark.parametrize("method", ["pad", "backfill"]) diff --git a/pandas/tests/frame/methods/test_describe.py b/pandas/tests/frame/methods/test_describe.py index ed1c3fcce378c..15bafb7a835ba 100644 --- a/pandas/tests/frame/methods/test_describe.py +++ b/pandas/tests/frame/methods/test_describe.py @@ -371,7 +371,7 @@ def test_describe_does_not_raise_error_for_dictlike_elements(self): # GH#32409 df = DataFrame([{"test": {"a": "1"}}, {"test": {"a": "2"}}]) expected = DataFrame( - {"test": [2, 2, {"a": "2"}, 1]}, index=["count", "unique", "top", "freq"] + {"test": [2, 2, {"a": "1"}, 1]}, index=["count", "unique", "top", "freq"] ) result = df.describe() tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/libs/test_hashtable.py b/pandas/tests/libs/test_hashtable.py index 4f650807afd30..5bf652c206a5f 100644 --- a/pandas/tests/libs/test_hashtable.py +++ b/pandas/tests/libs/test_hashtable.py @@ -272,6 +272,15 @@ def test_value_count(self, dtype, type_suffix, writable): tm.assert_numpy_array_equal(np.sort(keys), expected) assert np.all(counts == 5) + def test_value_count_stable(self, dtype, type_suffix, writable): + # GH12679 + value_count = get_ht_function("value_count", type_suffix) + values = np.array([2, 1, 5, 22, 3, -1, 8]).astype(dtype) + values.flags.writeable = writable + keys, counts = value_count(values, False) + tm.assert_numpy_array_equal(keys, values) + assert np.all(counts == 1) + def test_duplicated_first(self, dtype, type_suffix, writable): N = 100 duplicated = get_ht_function("duplicated", type_suffix) diff --git a/pandas/tests/series/methods/test_value_counts.py b/pandas/tests/series/methods/test_value_counts.py index f22b1be672190..505b879660ff1 100644 --- a/pandas/tests/series/methods/test_value_counts.py +++ b/pandas/tests/series/methods/test_value_counts.py @@ -185,7 +185,7 @@ def test_value_counts_categorical_with_nan(self): ( Series([False, True, True, pd.NA]), False, - Series([2, 1, 1], index=[True, pd.NA, False]), + 
Series([2, 1, 1], index=[True, False, pd.NA]), ), ( Series([False, True, True, pd.NA]), @@ -195,7 +195,7 @@ def test_value_counts_categorical_with_nan(self): ( Series(range(3), index=[True, False, np.nan]).index, False, - Series([1, 1, 1], index=[pd.NA, False, True]), + Series([1, 1, 1], index=[True, False, np.nan]), ), ], ) diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index fb982c02acd99..88757b96085aa 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -6,7 +6,7 @@ import pytest from pandas._libs import algos as libalgos, hashtable as ht -from pandas.compat import IS64, np_array_datetime64_compat +from pandas.compat import np_array_datetime64_compat import pandas.util._test_decorators as td from pandas.core.dtypes.common import ( @@ -1272,12 +1272,10 @@ def test_value_counts_uint64(self): tm.assert_series_equal(result, expected) arr = np.array([-1, 2 ** 63], dtype=object) - expected = Series([1, 1], index=[2 ** 63, -1]) + expected = Series([1, 1], index=[-1, 2 ** 63]) result = algos.value_counts(arr) - # 32-bit linux has a different ordering - if IS64: - tm.assert_series_equal(result, expected) + tm.assert_series_equal(result, expected) class TestDuplicated: