
Commit 7e531e3

ENH: making value_counts stable/keeping original ordering (#39009)
1 parent 2f88321 commit 7e531e3

File tree: 12 files changed, +94 -75 lines

doc/source/whatsnew/v1.3.0.rst (+1)

@@ -358,6 +358,7 @@ Reshaping
 - Bug in :func:`join` over :class:`MultiIndex` returned wrong result, when one of both indexes had only one level (:issue:`36909`)
 - :meth:`merge_asof` raises ``ValueError`` instead of cryptic ``TypeError`` in case of non-numerical merge columns (:issue:`29130`)
 - Bug in :meth:`DataFrame.join` not assigning values correctly when having :class:`MultiIndex` where at least one dimension is from dtype ``Categorical`` with non-alphabetically sorted categories (:issue:`38502`)
+- :meth:`Series.value_counts` returns keys in original order (:issue:`12679`, :issue:`11227`)
 - Bug in :meth:`DataFrame.apply` would give incorrect results when used with a string argument and ``axis=1`` when the axis argument was not supported and now raises a ``ValueError`` instead (:issue:`39211`)
 -
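
For orientation, a minimal illustration of what this whatsnew entry means: among keys with equal counts, the result now follows the order in which keys are first seen in the data rather than hash-bucket order. Illustrative only; the expected output matches the updated docstring examples in pandas/core/base.py shown later in this diff.

import numpy as np
import pandas as pd

s = pd.Series([3, 1, 2, 3, 4, np.nan])

# Ties are no longer returned in hash-bucket order: among equal counts,
# keys appear in the order they are first seen in the data.
print(s.value_counts())
# 3.0    2
# 1.0    1
# 2.0    1
# 4.0    1
# dtype: int64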

pandas/_libs/hashtable_class_helper.pxi.in (-7)

@@ -19,13 +19,6 @@ cdef kh{{name}}_t to_kh{{name}}_t({{name}}_t val) nogil:
     res.imag = val.imag
     return res

-
-cdef {{name}}_t to_{{name}}(kh{{name}}_t val) nogil:
-    cdef {{name}}_t res
-    res.real = val.real
-    res.imag = val.imag
-    return res
-
 {{endfor}}


pandas/_libs/hashtable_func_helper.pxi.in (+69 -46)

@@ -6,26 +6,26 @@ WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in

 {{py:

-# dtype, ttype, c_type, to_c_type, to_dtype
-dtypes = [('complex128', 'complex128', 'khcomplex128_t',
-           'to_khcomplex128_t', 'to_complex128'),
-          ('complex64', 'complex64', 'khcomplex64_t',
-           'to_khcomplex64_t', 'to_complex64'),
-          ('float64', 'float64', 'float64_t', '', ''),
-          ('float32', 'float32', 'float32_t', '', ''),
-          ('uint64', 'uint64', 'uint64_t', '', ''),
-          ('uint32', 'uint32', 'uint32_t', '', ''),
-          ('uint16', 'uint16', 'uint16_t', '', ''),
-          ('uint8', 'uint8', 'uint8_t', '', ''),
-          ('object', 'pymap', 'object', '', ''),
-          ('int64', 'int64', 'int64_t', '', ''),
-          ('int32', 'int32', 'int32_t', '', ''),
-          ('int16', 'int16', 'int16_t', '', ''),
-          ('int8', 'int8', 'int8_t', '', '')]
+# name, dtype, ttype, c_type, to_c_type
+dtypes = [('Complex128', 'complex128', 'complex128',
+           'khcomplex128_t', 'to_khcomplex128_t'),
+          ('Complex64', 'complex64', 'complex64',
+           'khcomplex64_t', 'to_khcomplex64_t'),
+          ('Float64', 'float64', 'float64', 'float64_t', ''),
+          ('Float32', 'float32', 'float32', 'float32_t', ''),
+          ('UInt64', 'uint64', 'uint64', 'uint64_t', ''),
+          ('UInt32', 'uint32', 'uint32', 'uint32_t', ''),
+          ('UInt16', 'uint16', 'uint16', 'uint16_t', ''),
+          ('UInt8', 'uint8', 'uint8', 'uint8_t', ''),
+          ('Object', 'object', 'pymap', 'object', ''),
+          ('Int64', 'int64', 'int64', 'int64_t', ''),
+          ('Int32', 'int32', 'int32', 'int32_t', ''),
+          ('Int16', 'int16', 'int16', 'int16_t', ''),
+          ('Int8', 'int8', 'int8', 'int8_t', '')]

 }}

-{{for dtype, ttype, c_type, to_c_type, to_dtype in dtypes}}
+{{for name, dtype, ttype, c_type, to_c_type in dtypes}}


 @cython.wraparound(False)
@@ -77,54 +77,77 @@ cdef build_count_table_{{dtype}}(const {{dtype}}_t[:] values,
 @cython.wraparound(False)
 @cython.boundscheck(False)
 {{if dtype == 'object'}}
-cpdef value_count_{{dtype}}(ndarray[{{dtype}}] values, bint dropna):
+cpdef value_count_{{dtype}}(ndarray[{{dtype}}] values, bint dropna, navalue=np.NaN):
 {{else}}
 cpdef value_count_{{dtype}}(const {{dtype}}_t[:] values, bint dropna):
 {{endif}}
     cdef:
         Py_ssize_t i = 0
+        Py_ssize_t n = len(values)
+        size_t unique_key_index = 0
+        size_t unique_key_count = 0
         kh_{{ttype}}_t *table

-        {{if dtype != 'object'}}
-        {{dtype}}_t[:] result_keys
-        int64_t[:] result_counts
-        {{endif}}
-
         # Don't use Py_ssize_t, since table.n_buckets is unsigned
         khiter_t k
+        bint is_null
+
+        {{c_type}} val
+
+        int ret = 0
+
+    # we track the order in which keys are first seen (GH39009),
+    # khash-map isn't insertion-ordered, thus:
+    #    table maps key to index_of_appearence
+    #    result_keys maps index_of_appearence to key
+    #    result_counts maps index_of_appearence to number of elements

+    result_keys = {{name}}Vector()
+    result_counts = Int64Vector()
     table = kh_init_{{ttype}}()
+
     {{if dtype == 'object'}}
-    build_count_table_{{dtype}}(values, table, 1)
+    kh_resize_{{ttype}}(table, n // 10)
+
+    for i in range(n):
+        val = values[i]
+        is_null = checknull(val)
+        if not is_null or not dropna:
+            # all nas become the same representative:
+            if is_null:
+                val = navalue
+            k = kh_get_{{ttype}}(table, <PyObject*>val)
+            if k != table.n_buckets:
+                unique_key_index = table.vals[k]
+                result_counts.data.data[unique_key_index] += 1
+            else:
+                k = kh_put_{{ttype}}(table, <PyObject*>val, &ret)
+                table.vals[k] = unique_key_count
+                result_keys.append(val)
+                result_counts.append(1)
+                unique_key_count+=1
     {{else}}
-    build_count_table_{{dtype}}(values, table, dropna)
-    {{endif}}
+    kh_resize_{{ttype}}(table, n)

-    result_keys = np.empty(table.n_occupied, '{{dtype}}')
-    result_counts = np.zeros(table.n_occupied, dtype=np.int64)
+    for i in range(n):
+        val = {{to_c_type}}(values[i])

-    {{if dtype == 'object'}}
-    for k in range(table.n_buckets):
-        if kh_exist_{{ttype}}(table, k):
-            result_keys[i] = <{{dtype}}>table.keys[k]
-            result_counts[i] = table.vals[k]
-            i += 1
-    {{else}}
-    with nogil:
-        for k in range(table.n_buckets):
-            if kh_exist_{{ttype}}(table, k):
-                result_keys[i] = {{to_dtype}}(table.keys[k])
-                result_counts[i] = table.vals[k]
-                i += 1
+        if not is_nan_{{c_type}}(val) or not dropna:
+            k = kh_get_{{ttype}}(table, val)
+            if k != table.n_buckets:
+                unique_key_index = table.vals[k]
+                result_counts.data.data[unique_key_index] += 1
+            else:
+                k = kh_put_{{ttype}}(table, val, &ret)
+                table.vals[k] = unique_key_count
+                result_keys.append(val)
+                result_counts.append(1)
+                unique_key_count+=1
     {{endif}}

     kh_destroy_{{ttype}}(table)

-    {{if dtype == 'object'}}
-    return result_keys, result_counts
-    {{else}}
-    return np.asarray(result_keys), np.asarray(result_counts)
-    {{endif}}
+    return result_keys.to_array(), result_counts.to_array()


 @cython.wraparound(False)
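
The old template built an unordered count table and then walked the hash buckets, so tie order depended on bucket layout. The new loop instead records the index at which each key is first seen and appends keys and counts in that order. Below is a rough Python analogue of the new loop — a sketch only, with a plain dict and lists standing in for the khash table and the {{name}}Vector/Int64Vector buffers, and NA handling reduced to a simple NaN check.

import numpy as np

def value_count_sketch(values, dropna=False):
    first_seen = {}      # key -> index of first appearance (plays the role of `table`)
    result_keys = []     # index of appearance -> key
    result_counts = []   # index of appearance -> count
    for val in values:
        if dropna and val != val:  # val != val is True only for NaN
            continue
        # note: unlike the Cython version, this sketch does not collapse
        # distinct NA objects into one representative value
        if val in first_seen:
            result_counts[first_seen[val]] += 1
        else:
            first_seen[val] = len(result_keys)
            result_keys.append(val)
            result_counts.append(1)
    return np.asarray(result_keys), np.asarray(result_counts, dtype=np.int64)

keys, counts = value_count_sketch([2, 1, 5, 22, 3, -1, 8])
# keys come back in first-appearance order: [2, 1, 5, 22, 3, -1, 8]; counts are all 1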

pandas/core/algorithms.py (-5)

@@ -866,11 +866,6 @@ def value_counts_arraylike(values, dropna: bool):
         f = getattr(htable, f"value_count_{ndtype}")
         keys, counts = f(values, dropna)

-        mask = isna(values)
-        if not dropna and mask.any() and not isna(keys).any():
-            keys = np.insert(keys, 0, np.NaN)
-            counts = np.insert(counts, 0, mask.sum())
-
     keys = _reconstruct_data(keys, original.dtype, original)

     return keys, counts
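
The deleted block patched a NaN key back in at position 0 after counting; the hashtable functions above now count NA inline (via the ``navalue``/``checknull`` handling), so with ``dropna=False`` NaN simply takes the slot where it is first seen. A small illustration, matching the ``dropna=False`` docstring example updated below:

import numpy as np
import pandas as pd

s = pd.Series([3, 1, 2, 3, 4, np.nan])
print(s.value_counts(dropna=False))
# 3.0    2
# 1.0    1
# 2.0    1
# 4.0    1
# NaN    1
# dtype: int64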

pandas/core/base.py (+4 -4)

@@ -1004,9 +1004,9 @@ def value_counts(
         >>> index = pd.Index([3, 1, 2, 3, 4, np.nan])
         >>> index.value_counts()
         3.0    2
+        1.0    1
         2.0    1
         4.0    1
-        1.0    1
         dtype: int64

         With `normalize` set to `True`, returns the relative frequency by
@@ -1015,9 +1015,9 @@ def value_counts(
         >>> s = pd.Series([3, 1, 2, 3, 4, np.nan])
         >>> s.value_counts(normalize=True)
         3.0    0.4
+        1.0    0.2
         2.0    0.2
         4.0    0.2
-        1.0    0.2
         dtype: float64

         **bins**
@@ -1039,10 +1039,10 @@ def value_counts(

         >>> s.value_counts(dropna=False)
         3.0    2
+        1.0    1
         2.0    1
-        NaN    1
         4.0    1
-        1.0    1
+        NaN    1
         dtype: int64
         """
         return value_counts(

pandas/tests/arrays/boolean/test_function.py (+3 -3)

@@ -77,18 +77,18 @@ def test_ufunc_reduce_raises(values):
 def test_value_counts_na():
     arr = pd.array([True, False, pd.NA], dtype="boolean")
     result = arr.value_counts(dropna=False)
-    expected = pd.Series([1, 1, 1], index=[False, True, pd.NA], dtype="Int64")
+    expected = pd.Series([1, 1, 1], index=[True, False, pd.NA], dtype="Int64")
     tm.assert_series_equal(result, expected)

     result = arr.value_counts(dropna=True)
-    expected = pd.Series([1, 1], index=[False, True], dtype="Int64")
+    expected = pd.Series([1, 1], index=[True, False], dtype="Int64")
     tm.assert_series_equal(result, expected)


 def test_value_counts_with_normalize():
     s = pd.Series([True, False, pd.NA], dtype="boolean")
     result = s.value_counts(normalize=True)
-    expected = pd.Series([1, 1], index=[False, True], dtype="Float64") / 2
+    expected = pd.Series([1, 1], index=[True, False], dtype="Float64") / 2
     tm.assert_series_equal(result, expected)


pandas/tests/arrays/string_/test_string.py (+1 -1)

@@ -497,7 +497,7 @@ def test_value_counts_na(dtype, request):

     arr = pd.array(["a", "b", "a", pd.NA], dtype=dtype)
     result = arr.value_counts(dropna=False)
-    expected = pd.Series([2, 1, 1], index=["a", pd.NA, "b"], dtype="Int64")
+    expected = pd.Series([2, 1, 1], index=["a", "b", pd.NA], dtype="Int64")
     tm.assert_series_equal(result, expected)

     result = arr.value_counts(dropna=True)

pandas/tests/arrays/test_datetimes.py (+1 -1)

@@ -288,7 +288,7 @@ def test_value_counts_preserves_tz(self):

         arr[-2] = pd.NaT
         result = arr.value_counts()
-        expected = pd.Series([1, 4, 2], index=[pd.NaT, dti[0], dti[1]])
+        expected = pd.Series([4, 2, 1], index=[dti[0], dti[1], pd.NaT])
         tm.assert_series_equal(result, expected)

     @pytest.mark.parametrize("method", ["pad", "backfill"])

pandas/tests/frame/methods/test_describe.py (+1 -1)

@@ -371,7 +371,7 @@ def test_describe_does_not_raise_error_for_dictlike_elements(self):
         # GH#32409
         df = DataFrame([{"test": {"a": "1"}}, {"test": {"a": "2"}}])
         expected = DataFrame(
-            {"test": [2, 2, {"a": "2"}, 1]}, index=["count", "unique", "top", "freq"]
+            {"test": [2, 2, {"a": "1"}, 1]}, index=["count", "unique", "top", "freq"]
         )
         result = df.describe()
         tm.assert_frame_equal(result, expected)

pandas/tests/libs/test_hashtable.py (+9)

@@ -272,6 +272,15 @@ def test_value_count(self, dtype, type_suffix, writable):
         tm.assert_numpy_array_equal(np.sort(keys), expected)
         assert np.all(counts == 5)

+    def test_value_count_stable(self, dtype, type_suffix, writable):
+        # GH12679
+        value_count = get_ht_function("value_count", type_suffix)
+        values = np.array([2, 1, 5, 22, 3, -1, 8]).astype(dtype)
+        values.flags.writeable = writable
+        keys, counts = value_count(values, False)
+        tm.assert_numpy_array_equal(keys, values)
+        assert np.all(counts == 1)
+
     def test_duplicated_first(self, dtype, type_suffix, writable):
         N = 100
         duplicated = get_ht_function("duplicated", type_suffix)

pandas/tests/series/methods/test_value_counts.py (+2 -2)

@@ -185,7 +185,7 @@ def test_value_counts_categorical_with_nan(self):
             (
                 Series([False, True, True, pd.NA]),
                 False,
-                Series([2, 1, 1], index=[True, pd.NA, False]),
+                Series([2, 1, 1], index=[True, False, pd.NA]),
             ),
             (
                 Series([False, True, True, pd.NA]),
@@ -195,7 +195,7 @@ def test_value_counts_categorical_with_nan(self):
             (
                 Series(range(3), index=[True, False, np.nan]).index,
                 False,
-                Series([1, 1, 1], index=[pd.NA, False, True]),
+                Series([1, 1, 1], index=[True, False, np.nan]),
             ),
         ],
     )

pandas/tests/test_algos.py (+3 -5)

@@ -6,7 +6,7 @@
 import pytest

 from pandas._libs import algos as libalgos, hashtable as ht
-from pandas.compat import IS64, np_array_datetime64_compat
+from pandas.compat import np_array_datetime64_compat
 import pandas.util._test_decorators as td

 from pandas.core.dtypes.common import (
@@ -1272,12 +1272,10 @@ def test_value_counts_uint64(self):
         tm.assert_series_equal(result, expected)

         arr = np.array([-1, 2 ** 63], dtype=object)
-        expected = Series([1, 1], index=[2 ** 63, -1])
+        expected = Series([1, 1], index=[-1, 2 ** 63])
         result = algos.value_counts(arr)

-        # 32-bit linux has a different ordering
-        if IS64:
-            tm.assert_series_equal(result, expected)
+        tm.assert_series_equal(result, expected)


 class TestDuplicated:
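
With ordering now determined by first appearance rather than hash layout, the result is the same on 32-bit and 64-bit builds, so the ``IS64`` guard can go. A quick sketch of the assertion the updated test makes, assuming the internal ``pandas.core.algorithms`` entry point used in this test file:

import numpy as np
from pandas.core import algorithms as algos

arr = np.array([-1, 2 ** 63], dtype=object)
result = algos.value_counts(arr)
# both keys have count 1; they come back in first-appearance order on every platform
assert list(result.index) == [-1, 2 ** 63]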
