
Commit 7e531e3

ENH: making value_counts stable/keeping original ordering (#39009)
1 parent 2f88321 commit 7e531e3

File tree: 12 files changed, +94 -75 lines

doc/source/whatsnew/v1.3.0.rst (+1)

@@ -358,6 +358,7 @@ Reshaping
 - Bug in :func:`join` over :class:`MultiIndex` returned wrong result, when one of both indexes had only one level (:issue:`36909`)
 - :meth:`merge_asof` raises ``ValueError`` instead of cryptic ``TypeError`` in case of non-numerical merge columns (:issue:`29130`)
 - Bug in :meth:`DataFrame.join` not assigning values correctly when having :class:`MultiIndex` where at least one dimension is from dtype ``Categorical`` with non-alphabetically sorted categories (:issue:`38502`)
+- :meth:`Series.value_counts` returns keys in original order (:issue:`12679`, :issue:`11227`)
 - Bug in :meth:`DataFrame.apply` would give incorrect results when used with a string argument and ``axis=1`` when the axis argument was not supported and now raises a ``ValueError`` instead (:issue:`39211`)
 -
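
For orientation, a minimal illustration of what this whatsnew entry means: among keys with equal counts, the result now follows the order in which keys are first seen in the data rather than hash-bucket order. Illustrative only; the expected output matches the updated docstring examples in pandas/core/base.py shown later in this diff.

import numpy as np
import pandas as pd

s = pd.Series([3, 1, 2, 3, 4, np.nan])

# Ties are no longer returned in hash-bucket order: among equal counts,
# keys appear in the order they are first seen in the data.
print(s.value_counts())
# 3.0    2
# 1.0    1
# 2.0    1
# 4.0    1
# dtype: int64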

pandas/_libs/hashtable_class_helper.pxi.in (-7)

@@ -19,13 +19,6 @@ cdef kh{{name}}_t to_kh{{name}}_t({{name}}_t val) nogil:
     res.imag = val.imag
     return res

-
-cdef {{name}}_t to_{{name}}(kh{{name}}_t val) nogil:
-    cdef {{name}}_t res
-    res.real = val.real
-    res.imag = val.imag
-    return res
-
 {{endfor}}


pandas/_libs/hashtable_func_helper.pxi.in (+69 -46)

@@ -6,26 +6,26 @@ WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in

 {{py:

-# dtype, ttype, c_type, to_c_type, to_dtype
-dtypes = [('complex128', 'complex128', 'khcomplex128_t',
-           'to_khcomplex128_t', 'to_complex128'),
-          ('complex64', 'complex64', 'khcomplex64_t',
-           'to_khcomplex64_t', 'to_complex64'),
-          ('float64', 'float64', 'float64_t', '', ''),
-          ('float32', 'float32', 'float32_t', '', ''),
-          ('uint64', 'uint64', 'uint64_t', '', ''),
-          ('uint32', 'uint32', 'uint32_t', '', ''),
-          ('uint16', 'uint16', 'uint16_t', '', ''),
-          ('uint8', 'uint8', 'uint8_t', '', ''),
-          ('object', 'pymap', 'object', '', ''),
-          ('int64', 'int64', 'int64_t', '', ''),
-          ('int32', 'int32', 'int32_t', '', ''),
-          ('int16', 'int16', 'int16_t', '', ''),
-          ('int8', 'int8', 'int8_t', '', '')]
+# name, dtype, ttype, c_type, to_c_type
+dtypes = [('Complex128', 'complex128', 'complex128',
+           'khcomplex128_t', 'to_khcomplex128_t'),
+          ('Complex64', 'complex64', 'complex64',
+           'khcomplex64_t', 'to_khcomplex64_t'),
+          ('Float64', 'float64', 'float64', 'float64_t', ''),
+          ('Float32', 'float32', 'float32', 'float32_t', ''),
+          ('UInt64', 'uint64', 'uint64', 'uint64_t', ''),
+          ('UInt32', 'uint32', 'uint32', 'uint32_t', ''),
+          ('UInt16', 'uint16', 'uint16', 'uint16_t', ''),
+          ('UInt8', 'uint8', 'uint8', 'uint8_t', ''),
+          ('Object', 'object', 'pymap', 'object', ''),
+          ('Int64', 'int64', 'int64', 'int64_t', ''),
+          ('Int32', 'int32', 'int32', 'int32_t', ''),
+          ('Int16', 'int16', 'int16', 'int16_t', ''),
+          ('Int8', 'int8', 'int8', 'int8_t', '')]

 }}

-{{for dtype, ttype, c_type, to_c_type, to_dtype in dtypes}}
+{{for name, dtype, ttype, c_type, to_c_type in dtypes}}


 @cython.wraparound(False)
@@ -77,54 +77,77 @@ cdef build_count_table_{{dtype}}(const {{dtype}}_t[:] values,
 @cython.wraparound(False)
 @cython.boundscheck(False)
 {{if dtype == 'object'}}
-cpdef value_count_{{dtype}}(ndarray[{{dtype}}] values, bint dropna):
+cpdef value_count_{{dtype}}(ndarray[{{dtype}}] values, bint dropna, navalue=np.NaN):
 {{else}}
 cpdef value_count_{{dtype}}(const {{dtype}}_t[:] values, bint dropna):
 {{endif}}
     cdef:
         Py_ssize_t i = 0
+        Py_ssize_t n = len(values)
+        size_t unique_key_index = 0
+        size_t unique_key_count = 0
         kh_{{ttype}}_t *table

-        {{if dtype != 'object'}}
-        {{dtype}}_t[:] result_keys
-        int64_t[:] result_counts
-        {{endif}}
-
         # Don't use Py_ssize_t, since table.n_buckets is unsigned
         khiter_t k
+        bint is_null
+
+        {{c_type}} val
+
+        int ret = 0
+
+    # we track the order in which keys are first seen (GH39009),
+    # khash-map isn't insertion-ordered, thus:
+    #    table maps key to index_of_appearence
+    #    result_keys maps index_of_appearence to key
+    #    result_counts maps index_of_appearence to number of elements

+    result_keys = {{name}}Vector()
+    result_counts = Int64Vector()
     table = kh_init_{{ttype}}()
+
     {{if dtype == 'object'}}
-    build_count_table_{{dtype}}(values, table, 1)
+    kh_resize_{{ttype}}(table, n // 10)
+
+    for i in range(n):
+        val = values[i]
+        is_null = checknull(val)
+        if not is_null or not dropna:
+            # all nas become the same representative:
+            if is_null:
+                val = navalue
+            k = kh_get_{{ttype}}(table, <PyObject*>val)
+            if k != table.n_buckets:
+                unique_key_index = table.vals[k]
+                result_counts.data.data[unique_key_index] += 1
+            else:
+                k = kh_put_{{ttype}}(table, <PyObject*>val, &ret)
+                table.vals[k] = unique_key_count
+                result_keys.append(val)
+                result_counts.append(1)
+                unique_key_count+=1
     {{else}}
-    build_count_table_{{dtype}}(values, table, dropna)
-    {{endif}}
+    kh_resize_{{ttype}}(table, n)

-    result_keys = np.empty(table.n_occupied, '{{dtype}}')
-    result_counts = np.zeros(table.n_occupied, dtype=np.int64)
+    for i in range(n):
+        val = {{to_c_type}}(values[i])

-    {{if dtype == 'object'}}
-    for k in range(table.n_buckets):
-        if kh_exist_{{ttype}}(table, k):
-            result_keys[i] = <{{dtype}}>table.keys[k]
-            result_counts[i] = table.vals[k]
-            i += 1
-    {{else}}
-    with nogil:
-        for k in range(table.n_buckets):
-            if kh_exist_{{ttype}}(table, k):
-                result_keys[i] = {{to_dtype}}(table.keys[k])
-                result_counts[i] = table.vals[k]
-                i += 1
+        if not is_nan_{{c_type}}(val) or not dropna:
+            k = kh_get_{{ttype}}(table, val)
+            if k != table.n_buckets:
+                unique_key_index = table.vals[k]
+                result_counts.data.data[unique_key_index] += 1
+            else:
+                k = kh_put_{{ttype}}(table, val, &ret)
+                table.vals[k] = unique_key_count
+                result_keys.append(val)
+                result_counts.append(1)
+                unique_key_count+=1
     {{endif}}

     kh_destroy_{{ttype}}(table)

-    {{if dtype == 'object'}}
-    return result_keys, result_counts
-    {{else}}
-    return np.asarray(result_keys), np.asarray(result_counts)
-    {{endif}}
+    return result_keys.to_array(), result_counts.to_array()


 @cython.wraparound(False)
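
The old template built an unordered count table and then walked the hash buckets, so tie order depended on bucket layout. The new loop instead records the index at which each key is first seen and appends keys and counts in that order. Below is a rough Python analogue of the new loop — a sketch only, with a plain dict and lists standing in for the khash table and the {{name}}Vector/Int64Vector buffers, and NA handling reduced to a simple NaN check.

import numpy as np

def value_count_sketch(values, dropna=False):
    first_seen = {}      # key -> index of first appearance (plays the role of `table`)
    result_keys = []     # index of appearance -> key
    result_counts = []   # index of appearance -> count
    for val in values:
        if dropna and val != val:  # val != val is True only for NaN
            continue
        # note: unlike the Cython version, this sketch does not collapse
        # distinct NA objects into one representative value
        if val in first_seen:
            result_counts[first_seen[val]] += 1
        else:
            first_seen[val] = len(result_keys)
            result_keys.append(val)
            result_counts.append(1)
    return np.asarray(result_keys), np.asarray(result_counts, dtype=np.int64)

keys, counts = value_count_sketch([2, 1, 5, 22, 3, -1, 8])
# keys come back in first-appearance order: [2, 1, 5, 22, 3, -1, 8]; counts are all 1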

pandas/core/algorithms.py (-5)

@@ -866,11 +866,6 @@ def value_counts_arraylike(values, dropna: bool):
         f = getattr(htable, f"value_count_{ndtype}")
         keys, counts = f(values, dropna)

-        mask = isna(values)
-        if not dropna and mask.any() and not isna(keys).any():
-            keys = np.insert(keys, 0, np.NaN)
-            counts = np.insert(counts, 0, mask.sum())
-
     keys = _reconstruct_data(keys, original.dtype, original)

     return keys, counts
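
The deleted block patched a NaN key back in at position 0 after counting; the hashtable functions above now count NA inline (via the ``navalue``/``checknull`` handling), so with ``dropna=False`` NaN simply takes the slot where it is first seen. A small illustration, matching the ``dropna=False`` docstring example updated below:

import numpy as np
import pandas as pd

s = pd.Series([3, 1, 2, 3, 4, np.nan])
print(s.value_counts(dropna=False))
# 3.0    2
# 1.0    1
# 2.0    1
# 4.0    1
# NaN    1
# dtype: int64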

pandas/core/base.py (+4 -4)

@@ -1004,9 +1004,9 @@ def value_counts(
         >>> index = pd.Index([3, 1, 2, 3, 4, np.nan])
         >>> index.value_counts()
         3.0    2
+        1.0    1
         2.0    1
         4.0    1
-        1.0    1
         dtype: int64

         With `normalize` set to `True`, returns the relative frequency by
@@ -1015,9 +1015,9 @@ def value_counts(
         >>> s = pd.Series([3, 1, 2, 3, 4, np.nan])
         >>> s.value_counts(normalize=True)
         3.0    0.4
+        1.0    0.2
         2.0    0.2
         4.0    0.2
-        1.0    0.2
         dtype: float64

         **bins**
@@ -1039,10 +1039,10 @@ def value_counts(

         >>> s.value_counts(dropna=False)
         3.0    2
+        1.0    1
         2.0    1
-        NaN    1
         4.0    1
-        1.0    1
+        NaN    1
         dtype: int64
         """
         return value_counts(

pandas/tests/arrays/boolean/test_function.py (+3 -3)

@@ -77,18 +77,18 @@ def test_ufunc_reduce_raises(values):
 def test_value_counts_na():
     arr = pd.array([True, False, pd.NA], dtype="boolean")
     result = arr.value_counts(dropna=False)
-    expected = pd.Series([1, 1, 1], index=[False, True, pd.NA], dtype="Int64")
+    expected = pd.Series([1, 1, 1], index=[True, False, pd.NA], dtype="Int64")
     tm.assert_series_equal(result, expected)

     result = arr.value_counts(dropna=True)
-    expected = pd.Series([1, 1], index=[False, True], dtype="Int64")
+    expected = pd.Series([1, 1], index=[True, False], dtype="Int64")
     tm.assert_series_equal(result, expected)


 def test_value_counts_with_normalize():
     s = pd.Series([True, False, pd.NA], dtype="boolean")
     result = s.value_counts(normalize=True)
-    expected = pd.Series([1, 1], index=[False, True], dtype="Float64") / 2
+    expected = pd.Series([1, 1], index=[True, False], dtype="Float64") / 2
     tm.assert_series_equal(result, expected)


pandas/tests/arrays/string_/test_string.py (+1 -1)

@@ -497,7 +497,7 @@ def test_value_counts_na(dtype, request):

     arr = pd.array(["a", "b", "a", pd.NA], dtype=dtype)
     result = arr.value_counts(dropna=False)
-    expected = pd.Series([2, 1, 1], index=["a", pd.NA, "b"], dtype="Int64")
+    expected = pd.Series([2, 1, 1], index=["a", "b", pd.NA], dtype="Int64")
     tm.assert_series_equal(result, expected)

     result = arr.value_counts(dropna=True)

pandas/tests/arrays/test_datetimes.py (+1 -1)

@@ -288,7 +288,7 @@ def test_value_counts_preserves_tz(self):

         arr[-2] = pd.NaT
         result = arr.value_counts()
-        expected = pd.Series([1, 4, 2], index=[pd.NaT, dti[0], dti[1]])
+        expected = pd.Series([4, 2, 1], index=[dti[0], dti[1], pd.NaT])
         tm.assert_series_equal(result, expected)

     @pytest.mark.parametrize("method", ["pad", "backfill"])

pandas/tests/frame/methods/test_describe.py (+1 -1)

@@ -371,7 +371,7 @@ def test_describe_does_not_raise_error_for_dictlike_elements(self):
         # GH#32409
         df = DataFrame([{"test": {"a": "1"}}, {"test": {"a": "2"}}])
         expected = DataFrame(
-            {"test": [2, 2, {"a": "2"}, 1]}, index=["count", "unique", "top", "freq"]
+            {"test": [2, 2, {"a": "1"}, 1]}, index=["count", "unique", "top", "freq"]
         )
         result = df.describe()
         tm.assert_frame_equal(result, expected)

pandas/tests/libs/test_hashtable.py (+9)

@@ -272,6 +272,15 @@ def test_value_count(self, dtype, type_suffix, writable):
         tm.assert_numpy_array_equal(np.sort(keys), expected)
         assert np.all(counts == 5)

+    def test_value_count_stable(self, dtype, type_suffix, writable):
+        # GH12679
+        value_count = get_ht_function("value_count", type_suffix)
+        values = np.array([2, 1, 5, 22, 3, -1, 8]).astype(dtype)
+        values.flags.writeable = writable
+        keys, counts = value_count(values, False)
+        tm.assert_numpy_array_equal(keys, values)
+        assert np.all(counts == 1)
+
     def test_duplicated_first(self, dtype, type_suffix, writable):
         N = 100
         duplicated = get_ht_function("duplicated", type_suffix)

pandas/tests/series/methods/test_value_counts.py (+2 -2)

@@ -185,7 +185,7 @@ def test_value_counts_categorical_with_nan(self):
             (
                 Series([False, True, True, pd.NA]),
                 False,
-                Series([2, 1, 1], index=[True, pd.NA, False]),
+                Series([2, 1, 1], index=[True, False, pd.NA]),
             ),
             (
                 Series([False, True, True, pd.NA]),
@@ -195,7 +195,7 @@ def test_value_counts_categorical_with_nan(self):
             (
                 Series(range(3), index=[True, False, np.nan]).index,
                 False,
-                Series([1, 1, 1], index=[pd.NA, False, True]),
+                Series([1, 1, 1], index=[True, False, np.nan]),
             ),
         ],
     )

pandas/tests/test_algos.py (+3 -5)

@@ -6,7 +6,7 @@
 import pytest

 from pandas._libs import algos as libalgos, hashtable as ht
-from pandas.compat import IS64, np_array_datetime64_compat
+from pandas.compat import np_array_datetime64_compat
 import pandas.util._test_decorators as td

 from pandas.core.dtypes.common import (
@@ -1272,12 +1272,10 @@ def test_value_counts_uint64(self):
         tm.assert_series_equal(result, expected)

         arr = np.array([-1, 2 ** 63], dtype=object)
-        expected = Series([1, 1], index=[2 ** 63, -1])
+        expected = Series([1, 1], index=[-1, 2 ** 63])
         result = algos.value_counts(arr)

-        # 32-bit linux has a different ordering
-        if IS64:
-            tm.assert_series_equal(result, expected)
+        tm.assert_series_equal(result, expected)


 class TestDuplicated:
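
With ordering now determined by first appearance rather than hash layout, the result is the same on 32-bit and 64-bit builds, so the ``IS64`` guard can go. A quick sketch of the assertion the updated test makes, assuming the internal ``pandas.core.algorithms`` entry point used in this test file:

import numpy as np
from pandas.core import algorithms as algos

arr = np.array([-1, 2 ** 63], dtype=object)
result = algos.value_counts(arr)
# both keys have count 1; they come back in first-appearance order on every platform
assert list(result.index) == [-1, 2 ** 63]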
