Skip to content

ENH: making value_counts stable/keeping original ordering #39009

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 14 commits into from
Jan 22, 2021
Merged
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.3.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -356,6 +356,7 @@ Reshaping
- Bug in :func:`join` over :class:`MultiIndex` returned wrong result, when one of both indexes had only one level (:issue:`36909`)
- :meth:`merge_asof` raises ``ValueError`` instead of cryptic ``TypeError`` in case of non-numerical merge columns (:issue:`29130`)
- Bug in :meth:`DataFrame.join` not assigning values correctly when having :class:`MultiIndex` where at least one dimension is from dtype ``Categorical`` with non-alphabetically sorted categories (:issue:`38502`)
- :meth:`Series.value_counts` returns keys in original order (:issue:`12679`, :issue:`11227`)
- Bug in :meth:`DataFrame.apply` would give incorrect results when used with a string argument and ``axis=1`` when the axis argument was not supported and now raises a ``ValueError`` instead (:issue:`39211`)
-

Expand Down
7 changes: 0 additions & 7 deletions pandas/_libs/hashtable_class_helper.pxi.in
Original file line number Diff line number Diff line change
Expand Up @@ -19,13 +19,6 @@ cdef kh{{name}}_t to_kh{{name}}_t({{name}}_t val) nogil:
res.imag = val.imag
return res


cdef {{name}}_t to_{{name}}(kh{{name}}_t val) nogil:
cdef {{name}}_t res
res.real = val.real
res.imag = val.imag
return res

{{endfor}}


Expand Down
115 changes: 69 additions & 46 deletions pandas/_libs/hashtable_func_helper.pxi.in
Original file line number Diff line number Diff line change
Expand Up @@ -6,26 +6,26 @@ WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in

{{py:

# dtype, ttype, c_type, to_c_type, to_dtype
dtypes = [('complex128', 'complex128', 'khcomplex128_t',
'to_khcomplex128_t', 'to_complex128'),
('complex64', 'complex64', 'khcomplex64_t',
'to_khcomplex64_t', 'to_complex64'),
('float64', 'float64', 'float64_t', '', ''),
('float32', 'float32', 'float32_t', '', ''),
('uint64', 'uint64', 'uint64_t', '', ''),
('uint32', 'uint32', 'uint32_t', '', ''),
('uint16', 'uint16', 'uint16_t', '', ''),
('uint8', 'uint8', 'uint8_t', '', ''),
('object', 'pymap', 'object', '', ''),
('int64', 'int64', 'int64_t', '', ''),
('int32', 'int32', 'int32_t', '', ''),
('int16', 'int16', 'int16_t', '', ''),
('int8', 'int8', 'int8_t', '', '')]
# name, dtype, ttype, c_type, to_c_type
dtypes = [('Complex128', 'complex128', 'complex128',
'khcomplex128_t', 'to_khcomplex128_t'),
('Complex64', 'complex64', 'complex64',
'khcomplex64_t', 'to_khcomplex64_t'),
('Float64', 'float64', 'float64', 'float64_t', ''),
('Float32', 'float32', 'float32', 'float32_t', ''),
('UInt64', 'uint64', 'uint64', 'uint64_t', ''),
('UInt32', 'uint32', 'uint32', 'uint32_t', ''),
('UInt16', 'uint16', 'uint16', 'uint16_t', ''),
('UInt8', 'uint8', 'uint8', 'uint8_t', ''),
('Object', 'object', 'pymap', 'object', ''),
('Int64', 'int64', 'int64', 'int64_t', ''),
('Int32', 'int32', 'int32', 'int32_t', ''),
('Int16', 'int16', 'int16', 'int16_t', ''),
('Int8', 'int8', 'int8', 'int8_t', '')]

}}

{{for dtype, ttype, c_type, to_c_type, to_dtype in dtypes}}
{{for name, dtype, ttype, c_type, to_c_type in dtypes}}


@cython.wraparound(False)
Expand Down Expand Up @@ -77,54 +77,77 @@ cdef build_count_table_{{dtype}}(const {{dtype}}_t[:] values,
@cython.wraparound(False)
@cython.boundscheck(False)
{{if dtype == 'object'}}
cpdef value_count_{{dtype}}(ndarray[{{dtype}}] values, bint dropna):
cpdef value_count_{{dtype}}(ndarray[{{dtype}}] values, bint dropna, navalue=np.NaN):
{{else}}
cpdef value_count_{{dtype}}(const {{dtype}}_t[:] values, bint dropna):
{{endif}}
cdef:
Py_ssize_t i = 0
Py_ssize_t n = len(values)
size_t unique_key_index = 0
size_t unique_key_count = 0
kh_{{ttype}}_t *table

{{if dtype != 'object'}}
{{dtype}}_t[:] result_keys
int64_t[:] result_counts
{{endif}}

# Don't use Py_ssize_t, since table.n_buckets is unsigned
khiter_t k
bint is_null

{{c_type}} val

int ret = 0

# we track the order in which keys are first seen (GH39009),
# khash-map isn't insertion-ordered, thus:
# table maps key to index_of_appearence
# result_keys maps index_of_appearence to key
# result_counts maps index_of_appearence to number of elements

result_keys = {{name}}Vector()
result_counts = Int64Vector()
table = kh_init_{{ttype}}()

{{if dtype == 'object'}}
build_count_table_{{dtype}}(values, table, 1)
kh_resize_{{ttype}}(table, n // 10)

for i in range(n):
val = values[i]
is_null = checknull(val)
if not is_null or not dropna:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

could be elif

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't see how it could work with elif (but I'm always struggling with logic). I have rearranged the if and hope it is clearer now: 230215f

# all nas become the same representative:
if is_null:
val = navalue
k = kh_get_{{ttype}}(table, <PyObject*>val)
if k != table.n_buckets:
unique_key_index = table.vals[k]
result_counts.data.data[unique_key_index] += 1
else:
k = kh_put_{{ttype}}(table, <PyObject*>val, &ret)
table.vals[k] = unique_key_count
result_keys.append(val)
result_counts.append(1)
unique_key_count+=1
{{else}}
build_count_table_{{dtype}}(values, table, dropna)
{{endif}}
kh_resize_{{ttype}}(table, n)

result_keys = np.empty(table.n_occupied, '{{dtype}}')
result_counts = np.zeros(table.n_occupied, dtype=np.int64)
for i in range(n):
val = {{to_c_type}}(values[i])

{{if dtype == 'object'}}
for k in range(table.n_buckets):
if kh_exist_{{ttype}}(table, k):
result_keys[i] = <{{dtype}}>table.keys[k]
result_counts[i] = table.vals[k]
i += 1
{{else}}
with nogil:
for k in range(table.n_buckets):
if kh_exist_{{ttype}}(table, k):
result_keys[i] = {{to_dtype}}(table.keys[k])
result_counts[i] = table.vals[k]
i += 1
if not is_nan_{{c_type}}(val) or not dropna:
k = kh_get_{{ttype}}(table, val)
if k != table.n_buckets:
unique_key_index = table.vals[k]
result_counts.data.data[unique_key_index] += 1
else:
k = kh_put_{{ttype}}(table, val, &ret)
table.vals[k] = unique_key_count
result_keys.append(val)
result_counts.append(1)
unique_key_count+=1
{{endif}}

kh_destroy_{{ttype}}(table)

{{if dtype == 'object'}}
return result_keys, result_counts
{{else}}
return np.asarray(result_keys), np.asarray(result_counts)
{{endif}}
return result_keys.to_array(), result_counts.to_array()


@cython.wraparound(False)
Expand Down
5 changes: 0 additions & 5 deletions pandas/core/algorithms.py
Original file line number Diff line number Diff line change
Expand Up @@ -866,11 +866,6 @@ def value_counts_arraylike(values, dropna: bool):
f = getattr(htable, f"value_count_{ndtype}")
keys, counts = f(values, dropna)

mask = isna(values)
if not dropna and mask.any() and not isna(keys).any():
keys = np.insert(keys, 0, np.NaN)
counts = np.insert(counts, 0, mask.sum())

keys = _reconstruct_data(keys, original.dtype, original)

return keys, counts
Expand Down
8 changes: 4 additions & 4 deletions pandas/core/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -1004,9 +1004,9 @@ def value_counts(
>>> index = pd.Index([3, 1, 2, 3, 4, np.nan])
>>> index.value_counts()
3.0 2
1.0 1
2.0 1
4.0 1
1.0 1
dtype: int64

With `normalize` set to `True`, returns the relative frequency by
Expand All @@ -1015,9 +1015,9 @@ def value_counts(
>>> s = pd.Series([3, 1, 2, 3, 4, np.nan])
>>> s.value_counts(normalize=True)
3.0 0.4
1.0 0.2
2.0 0.2
4.0 0.2
1.0 0.2
dtype: float64

**bins**
Expand All @@ -1039,10 +1039,10 @@ def value_counts(

>>> s.value_counts(dropna=False)
3.0 2
1.0 1
2.0 1
NaN 1
4.0 1
1.0 1
NaN 1
dtype: int64
"""
return value_counts(
Expand Down
6 changes: 3 additions & 3 deletions pandas/tests/arrays/boolean/test_function.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,18 +77,18 @@ def test_ufunc_reduce_raises(values):
def test_value_counts_na():
arr = pd.array([True, False, pd.NA], dtype="boolean")
result = arr.value_counts(dropna=False)
expected = pd.Series([1, 1, 1], index=[False, True, pd.NA], dtype="Int64")
expected = pd.Series([1, 1, 1], index=[True, False, pd.NA], dtype="Int64")
tm.assert_series_equal(result, expected)

result = arr.value_counts(dropna=True)
expected = pd.Series([1, 1], index=[False, True], dtype="Int64")
expected = pd.Series([1, 1], index=[True, False], dtype="Int64")
tm.assert_series_equal(result, expected)


def test_value_counts_with_normalize():
s = pd.Series([True, False, pd.NA], dtype="boolean")
result = s.value_counts(normalize=True)
expected = pd.Series([1, 1], index=[False, True], dtype="Float64") / 2
expected = pd.Series([1, 1], index=[True, False], dtype="Float64") / 2
tm.assert_series_equal(result, expected)


Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/arrays/string_/test_string.py
Original file line number Diff line number Diff line change
Expand Up @@ -497,7 +497,7 @@ def test_value_counts_na(dtype, request):

arr = pd.array(["a", "b", "a", pd.NA], dtype=dtype)
result = arr.value_counts(dropna=False)
expected = pd.Series([2, 1, 1], index=["a", pd.NA, "b"], dtype="Int64")
expected = pd.Series([2, 1, 1], index=["a", "b", pd.NA], dtype="Int64")
tm.assert_series_equal(result, expected)

result = arr.value_counts(dropna=True)
Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/arrays/test_datetimes.py
Original file line number Diff line number Diff line change
Expand Up @@ -288,7 +288,7 @@ def test_value_counts_preserves_tz(self):

arr[-2] = pd.NaT
result = arr.value_counts()
expected = pd.Series([1, 4, 2], index=[pd.NaT, dti[0], dti[1]])
expected = pd.Series([4, 2, 1], index=[dti[0], dti[1], pd.NaT])
tm.assert_series_equal(result, expected)

@pytest.mark.parametrize("method", ["pad", "backfill"])
Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/frame/methods/test_describe.py
Original file line number Diff line number Diff line change
Expand Up @@ -371,7 +371,7 @@ def test_describe_does_not_raise_error_for_dictlike_elements(self):
# GH#32409
df = DataFrame([{"test": {"a": "1"}}, {"test": {"a": "2"}}])
expected = DataFrame(
{"test": [2, 2, {"a": "2"}, 1]}, index=["count", "unique", "top", "freq"]
{"test": [2, 2, {"a": "1"}, 1]}, index=["count", "unique", "top", "freq"]
)
result = df.describe()
tm.assert_frame_equal(result, expected)
Expand Down
9 changes: 9 additions & 0 deletions pandas/tests/libs/test_hashtable.py
Original file line number Diff line number Diff line change
Expand Up @@ -272,6 +272,15 @@ def test_value_count(self, dtype, type_suffix, writable):
tm.assert_numpy_array_equal(np.sort(keys), expected)
assert np.all(counts == 5)

def test_value_count_stable(self, dtype, type_suffix, writable):
# GH12679
value_count = get_ht_function("value_count", type_suffix)
values = np.array([2, 1, 5, 22, 3, -1, 8]).astype(dtype)
values.flags.writeable = writable
keys, counts = value_count(values, False)
tm.assert_numpy_array_equal(keys, values)
assert np.all(counts == 1)

def test_duplicated_first(self, dtype, type_suffix, writable):
N = 100
duplicated = get_ht_function("duplicated", type_suffix)
Expand Down
4 changes: 2 additions & 2 deletions pandas/tests/series/methods/test_value_counts.py
Original file line number Diff line number Diff line change
Expand Up @@ -185,7 +185,7 @@ def test_value_counts_categorical_with_nan(self):
(
Series([False, True, True, pd.NA]),
False,
Series([2, 1, 1], index=[True, pd.NA, False]),
Series([2, 1, 1], index=[True, False, pd.NA]),
),
(
Series([False, True, True, pd.NA]),
Expand All @@ -195,7 +195,7 @@ def test_value_counts_categorical_with_nan(self):
(
Series(range(3), index=[True, False, np.nan]).index,
False,
Series([1, 1, 1], index=[pd.NA, False, True]),
Series([1, 1, 1], index=[True, False, np.nan]),
),
],
)
Expand Down
8 changes: 3 additions & 5 deletions pandas/tests/test_algos.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
import pytest

from pandas._libs import algos as libalgos, hashtable as ht
from pandas.compat import IS64, np_array_datetime64_compat
from pandas.compat import np_array_datetime64_compat
import pandas.util._test_decorators as td

from pandas.core.dtypes.common import (
Expand Down Expand Up @@ -1272,12 +1272,10 @@ def test_value_counts_uint64(self):
tm.assert_series_equal(result, expected)

arr = np.array([-1, 2 ** 63], dtype=object)
expected = Series([1, 1], index=[2 ** 63, -1])
expected = Series([1, 1], index=[-1, 2 ** 63])
result = algos.value_counts(arr)

# 32-bit linux has a different ordering
if IS64:
tm.assert_series_equal(result, expected)
tm.assert_series_equal(result, expected)


class TestDuplicated:
Expand Down