Skip to content

PERF: value_counts_float64 #10821 #10840

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 4 commits into the base branch from the contributor's branch (branch names lost in page extraction) on
Aug 18, 2015
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion doc/source/whatsnew/v0.17.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -585,7 +585,7 @@ Performance Improvements
- Improved performance of ``Series.isin`` for datetimelike/integer Series (:issue:`10287`)
- 20x improvement in ``concat`` of Categoricals when categories are identical (:issue:`10587`)
- Improved performance of ``to_datetime`` when specified format string is ISO8601 (:issue:`10178`)

- 2x improvement of ``Series.value_counts`` for float dtype (:issue:`10821`)

.. _whatsnew_0170.bug_fixes:

Expand Down
7 changes: 5 additions & 2 deletions pandas/core/algorithms.py
Original file line number Diff line number Diff line change
Expand Up @@ -232,7 +232,7 @@ def value_counts(values, sort=True, ascending=False, normalize=False,
values = PeriodIndex(values, name=name)

values = values.view(np.int64)
keys, counts = htable.value_count_int64(values)
keys, counts = htable.value_count_scalar64(values, dropna)

if dropna:
from pandas.tslib import iNaT
Expand All @@ -244,7 +244,10 @@ def value_counts(values, sort=True, ascending=False, normalize=False,

elif com.is_integer_dtype(dtype):
values = com._ensure_int64(values)
keys, counts = htable.value_count_int64(values)
keys, counts = htable.value_count_scalar64(values, dropna)
elif com.is_float_dtype(dtype):
values = com._ensure_float64(values)
keys, counts = htable.value_count_scalar64(values, dropna)

else:
values = com._ensure_object(values)
Expand Down
2 changes: 1 addition & 1 deletion pandas/core/categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -1030,7 +1030,7 @@ def value_counts(self, dropna=True):
from pandas.core.index import CategoricalIndex

cat = self.dropna() if dropna else self
keys, counts = htable.value_count_int64(com._ensure_int64(cat._codes))
keys, counts = htable.value_count_scalar64(com._ensure_int64(cat._codes), dropna)
result = Series(counts, index=keys)

ix = np.arange(len(cat.categories), dtype='int64')
Expand Down
97 changes: 68 additions & 29 deletions pandas/hashtable.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -866,51 +866,90 @@ cdef class Int64Factorizer:
self.count = len(self.uniques)
return labels


ctypedef fused kh_scalar64:
kh_int64_t
kh_float64_t

@cython.boundscheck(False)
cdef build_count_table_int64(int64_t[:] values, kh_int64_t *table):
cdef build_count_table_scalar64(sixty_four_bit_scalar[:] values,
kh_scalar64 *table, bint dropna):
cdef:
khiter_t k
Py_ssize_t i, n = len(values)
int64_t val
sixty_four_bit_scalar val
int ret = 0

with nogil:
kh_resize_int64(table, n)
if sixty_four_bit_scalar is float64_t and kh_scalar64 is kh_float64_t:
with nogil:
kh_resize_float64(table, n)

for i in range(n):
val = values[i]
if val == val or not dropna:
k = kh_get_float64(table, val)
if k != table.n_buckets:
table.vals[k] += 1
else:
k = kh_put_float64(table, val, &ret)
table.vals[k] = 1
elif sixty_four_bit_scalar is int64_t and kh_scalar64 is kh_int64_t:
with nogil:
kh_resize_int64(table, n)

for i in range(n):
val = values[i]
k = kh_get_int64(table, val)
if k != table.n_buckets:
table.vals[k] += 1
else:
k = kh_put_int64(table, val, &ret)
table.vals[k] = 1
else:
raise ValueError("Table type must match scalar type.")

for i in range(n):
val = values[i]
k = kh_get_int64(table, val)
if k != table.n_buckets:
table.vals[k] += 1
else:
k = kh_put_int64(table, val, &ret)
table.vals[k] = 1


@cython.boundscheck(False)
cpdef value_count_int64(int64_t[:] values):
cpdef value_count_scalar64(sixty_four_bit_scalar[:] values, bint dropna):
cdef:
Py_ssize_t i
kh_int64_t *table
int64_t[:] result_keys, result_counts
kh_float64_t *ftable
kh_int64_t *itable
sixty_four_bit_scalar[:] result_keys
int64_t[:] result_counts
int k

table = kh_init_int64()
build_count_table_int64(values, table)

i = 0
result_keys = np.empty(table.n_occupied, dtype=np.int64)
result_counts = np.zeros(table.n_occupied, dtype=np.int64)

with nogil:
for k in range(table.n_buckets):
if kh_exist_int64(table, k):
result_keys[i] = table.keys[k]
result_counts[i] = table.vals[k]
i += 1
kh_destroy_int64(table)
if sixty_four_bit_scalar is float64_t:
ftable = kh_init_float64()
build_count_table_scalar64(values, ftable, dropna)

result_keys = np.empty(ftable.n_occupied, dtype=np.float64)
result_counts = np.zeros(ftable.n_occupied, dtype=np.int64)

with nogil:
for k in range(ftable.n_buckets):
if kh_exist_float64(ftable, k):
result_keys[i] = ftable.keys[k]
result_counts[i] = ftable.vals[k]
i += 1
kh_destroy_float64(ftable)

elif sixty_four_bit_scalar is int64_t:
itable = kh_init_int64()
build_count_table_scalar64(values, itable, dropna)

result_keys = np.empty(itable.n_occupied, dtype=np.int64)
result_counts = np.zeros(itable.n_occupied, dtype=np.int64)

with nogil:
for k in range(itable.n_buckets):
if kh_exist_int64(itable, k):
result_keys[i] = itable.keys[k]
result_counts[i] = itable.vals[k]
i += 1
kh_destroy_int64(itable)

return np.asarray(result_keys), np.asarray(result_counts)

Expand Down Expand Up @@ -1002,7 +1041,7 @@ def mode_int64(int64_t[:] values):

table = kh_init_int64()

build_count_table_int64(values, table)
build_count_table_scalar64(values, table, 0)

modes = np.empty(table.n_buckets, dtype=np.int64)

Expand Down
9 changes: 9 additions & 0 deletions vb_suite/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -194,6 +194,15 @@ def f():
series_value_counts_strings = Benchmark('s.value_counts()', setup,
start_date=datetime(2011, 10, 21))

#value_counts on float dtype

setup = common_setup + """
s = Series(np.random.randint(0, 1000, size=100000)).astype(float)
"""

series_value_counts_float64 = Benchmark('s.value_counts()', setup,
start_date=datetime(2015, 8, 17))

#----------------------------------------------------------------------
# pivot_table

Expand Down