From f0706b1843883f12807baeaad8a6046dd8f767e9 Mon Sep 17 00:00:00 2001 From: Chris Date: Mon, 17 Aug 2015 20:46:55 -0500 Subject: [PATCH 1/3] PERF: value_counts_float64 #10821 --- doc/source/whatsnew/v0.17.0.txt | 2 +- pandas/core/algorithms.py | 3 +++ pandas/hashtable.pyx | 45 +++++++++++++++++++++++++++++++++ vb_suite/groupby.py | 9 +++++++ 4 files changed, 58 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.17.0.txt b/doc/source/whatsnew/v0.17.0.txt index 7e69a8044a305..8ae2aa1659077 100644 --- a/doc/source/whatsnew/v0.17.0.txt +++ b/doc/source/whatsnew/v0.17.0.txt @@ -585,7 +585,7 @@ Performance Improvements - Improved performance of ``Series.isin`` for datetimelike/integer Series (:issue:`10287`) - 20x improvement in ``concat`` of Categoricals when categories are identical (:issue:`10587`) - Improved performance of ``to_datetime`` when specified format string is ISO8601 (:issue:`10178`) - +- 2x improvement of ``Series.value_counts`` for float dtype (:issue:`10821`) .. 
_whatsnew_0170.bug_fixes: diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index b0c7ff43bc7d8..21ace4bb1832d 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -245,6 +245,9 @@ def value_counts(values, sort=True, ascending=False, normalize=False, elif com.is_integer_dtype(dtype): values = com._ensure_int64(values) keys, counts = htable.value_count_int64(values) + elif com.is_float_dtype(dtype): + values = com._ensure_float64(values) + keys, counts = htable.value_count_float64(values, dropna) else: values = com._ensure_object(values) diff --git a/pandas/hashtable.pyx b/pandas/hashtable.pyx index 7dbd1b45c938f..fa3e2f6c4ba0b 100644 --- a/pandas/hashtable.pyx +++ b/pandas/hashtable.pyx @@ -866,7 +866,52 @@ cdef class Int64Factorizer: self.count = len(self.uniques) return labels +@cython.boundscheck(False) +cdef build_count_table_float64(float64_t[:] values, kh_float64_t *table, bint dropna): + cdef: + khiter_t k + Py_ssize_t i, n = len(values) + float64_t val + int ret = 0 + + with nogil: + kh_resize_float64(table, n) + + for i in range(n): + val = values[i] + if val == val or not dropna: + k = kh_get_float64(table, val) + if k != table.n_buckets: + table.vals[k] += 1 + else: + k = kh_put_float64(table, val, &ret) + table.vals[k] = 1 + +@cython.boundscheck(False) +cpdef value_count_float64(float64_t[:] values, bint dropna): + cdef: + Py_ssize_t i + kh_float64_t * table + float64_t[:] result_keys + int64_t[:] result_counts + int k + + table = kh_init_float64() + build_count_table_float64(values, table, dropna) + + i = 0 + result_keys = np.empty(table.n_occupied, dtype=np.float64) + result_counts = np.zeros(table.n_occupied, dtype=np.int64) + with nogil: + for k in range(table.n_buckets): + if kh_exist_float64(table, k): + result_keys[i] = table.keys[k] + result_counts[i] = table.vals[k] + i += 1 + kh_destroy_float64(table) + + return np.asarray(result_keys), np.asarray(result_counts) @cython.boundscheck(False) cdef 
build_count_table_int64(int64_t[:] values, kh_int64_t *table): diff --git a/vb_suite/groupby.py b/vb_suite/groupby.py index 73f5f19d6a626..bceb78c26e6ac 100644 --- a/vb_suite/groupby.py +++ b/vb_suite/groupby.py @@ -194,6 +194,15 @@ def f(): series_value_counts_strings = Benchmark('s.value_counts()', setup, start_date=datetime(2011, 10, 21)) +#value_counts on float dtype + +setup = common_setup + """ +s = Series(np.random.randint(0, 1000, size=100000)).astype(float) +""" + +series_value_counts_float64 = Benchmark('s.value_counts()', setup, + start_date=datetime(2015, 8, 17)) + #---------------------------------------------------------------------- # pivot_table From cf002dcce4adcf45e1256b6f16aa2e2833a4cfc5 Mon Sep 17 00:00:00 2001 From: Ian Henriksen Date: Tue, 18 Aug 2015 11:25:25 -0600 Subject: [PATCH 2/3] CLN: Combined value_count_int64 and value_count_float64 into a single routine using fused types. --- pandas/core/algorithms.py | 6 +-- pandas/core/categorical.py | 2 +- pandas/hashtable.pyx | 75 +++++++++++++++++--------------------- 3 files changed, 38 insertions(+), 45 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 21ace4bb1832d..0b11a2bae3973 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -232,7 +232,7 @@ def value_counts(values, sort=True, ascending=False, normalize=False, values = PeriodIndex(values, name=name) values = values.view(np.int64) - keys, counts = htable.value_count_int64(values) + keys, counts = htable.value_count_scalar64(values, dropna) if dropna: from pandas.tslib import iNaT @@ -244,10 +244,10 @@ def value_counts(values, sort=True, ascending=False, normalize=False, elif com.is_integer_dtype(dtype): values = com._ensure_int64(values) - keys, counts = htable.value_count_int64(values) + keys, counts = htable.value_count_scalar64(values, dropna) elif com.is_float_dtype(dtype): values = com._ensure_float64(values) - keys, counts = htable.value_count_float64(values, dropna) + keys,
counts = htable.value_count_scalar64(values, dropna) else: values = com._ensure_object(values) diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py index c9e30ea31dab8..b204cba997b98 100644 --- a/pandas/core/categorical.py +++ b/pandas/core/categorical.py @@ -1030,7 +1030,7 @@ def value_counts(self, dropna=True): from pandas.core.index import CategoricalIndex cat = self.dropna() if dropna else self - keys, counts = htable.value_count_int64(com._ensure_int64(cat._codes)) + keys, counts = htable.value_count_scalar64(com._ensure_int64(cat._codes), dropna) result = Series(counts, index=keys) ix = np.arange(len(cat.categories), dtype='int64') diff --git a/pandas/hashtable.pyx b/pandas/hashtable.pyx index fa3e2f6c4ba0b..573db92b53565 100644 --- a/pandas/hashtable.pyx +++ b/pandas/hashtable.pyx @@ -887,29 +887,48 @@ cdef build_count_table_float64(float64_t[:] values, kh_float64_t *table, bint dr k = kh_put_float64(table, val, &ret) table.vals[k] = 1 + @cython.boundscheck(False) -cpdef value_count_float64(float64_t[:] values, bint dropna): +cpdef value_count_scalar64(sixty_four_bit_scalar[:] values, bint dropna): cdef: Py_ssize_t i - kh_float64_t * table - float64_t[:] result_keys + kh_float64_t *ftable + kh_int64_t *itable + sixty_four_bit_scalar[:] result_keys int64_t[:] result_counts int k - table = kh_init_float64() - build_count_table_float64(values, table, dropna) - i = 0 - result_keys = np.empty(table.n_occupied, dtype=np.float64) - result_counts = np.zeros(table.n_occupied, dtype=np.int64) - with nogil: - for k in range(table.n_buckets): - if kh_exist_float64(table, k): - result_keys[i] = table.keys[k] - result_counts[i] = table.vals[k] - i += 1 - kh_destroy_float64(table) + if sixty_four_bit_scalar is float64_t: + ftable = kh_init_float64() + build_count_table_float64(values, ftable, dropna) + + result_keys = np.empty(ftable.n_occupied, dtype=np.float64) + result_counts = np.zeros(ftable.n_occupied, dtype=np.int64) + + with nogil: + for k in 
range(ftable.n_buckets): + if kh_exist_float64(ftable, k): + result_keys[i] = ftable.keys[k] + result_counts[i] = ftable.vals[k] + i += 1 + kh_destroy_float64(ftable) + + elif sixty_four_bit_scalar is int64_t: + itable = kh_init_int64() + build_count_table_int64(values, itable) + + result_keys = np.empty(itable.n_occupied, dtype=np.int64) + result_counts = np.zeros(itable.n_occupied, dtype=np.int64) + + with nogil: + for k in range(itable.n_buckets): + if kh_exist_int64(itable, k): + result_keys[i] = itable.keys[k] + result_counts[i] = itable.vals[k] + i += 1 + kh_destroy_int64(itable) return np.asarray(result_keys), np.asarray(result_counts) @@ -934,32 +953,6 @@ cdef build_count_table_int64(int64_t[:] values, kh_int64_t *table): table.vals[k] = 1 -@cython.boundscheck(False) -cpdef value_count_int64(int64_t[:] values): - cdef: - Py_ssize_t i - kh_int64_t *table - int64_t[:] result_keys, result_counts - int k - - table = kh_init_int64() - build_count_table_int64(values, table) - - i = 0 - result_keys = np.empty(table.n_occupied, dtype=np.int64) - result_counts = np.zeros(table.n_occupied, dtype=np.int64) - - with nogil: - for k in range(table.n_buckets): - if kh_exist_int64(table, k): - result_keys[i] = table.keys[k] - result_counts[i] = table.vals[k] - i += 1 - kh_destroy_int64(table) - - return np.asarray(result_keys), np.asarray(result_counts) - - cdef build_count_table_object(ndarray[object] values, ndarray[uint8_t, cast=True] mask, kh_pymap_t *table): From 8bb17cb02c31bb89e87a8fbddec0537fabd1d81d Mon Sep 17 00:00:00 2001 From: Ian Henriksen Date: Tue, 18 Aug 2015 11:39:15 -0600 Subject: [PATCH 3/3] CLN: Combined build_count_table_int64 and build_count_table_float64 into a single function using fused types. 
--- pandas/hashtable.pyx | 65 ++++++++++++++++++++++---------------------- 1 file changed, 33 insertions(+), 32 deletions(-) diff --git a/pandas/hashtable.pyx b/pandas/hashtable.pyx index 573db92b53565..dfa7930ada62f 100644 --- a/pandas/hashtable.pyx +++ b/pandas/hashtable.pyx @@ -866,26 +866,47 @@ cdef class Int64Factorizer: self.count = len(self.uniques) return labels +ctypedef fused kh_scalar64: + kh_int64_t + kh_float64_t + @cython.boundscheck(False) -cdef build_count_table_float64(float64_t[:] values, kh_float64_t *table, bint dropna): +cdef build_count_table_scalar64(sixty_four_bit_scalar[:] values, + kh_scalar64 *table, bint dropna): cdef: khiter_t k Py_ssize_t i, n = len(values) - float64_t val + sixty_four_bit_scalar val int ret = 0 - with nogil: - kh_resize_float64(table, n) + if sixty_four_bit_scalar is float64_t and kh_scalar64 is kh_float64_t: + with nogil: + kh_resize_float64(table, n) - for i in range(n): - val = values[i] - if val == val or not dropna: - k = kh_get_float64(table, val) + for i in range(n): + val = values[i] + if val == val or not dropna: + k = kh_get_float64(table, val) + if k != table.n_buckets: + table.vals[k] += 1 + else: + k = kh_put_float64(table, val, &ret) + table.vals[k] = 1 + elif sixty_four_bit_scalar is int64_t and kh_scalar64 is kh_int64_t: + with nogil: + kh_resize_int64(table, n) + + for i in range(n): + val = values[i] + k = kh_get_int64(table, val) if k != table.n_buckets: table.vals[k] += 1 else: - k = kh_put_float64(table, val, &ret) + k = kh_put_int64(table, val, &ret) table.vals[k] = 1 + else: + raise ValueError("Table type must match scalar type.") + @cython.boundscheck(False) @@ -902,7 +923,7 @@ cpdef value_count_scalar64(sixty_four_bit_scalar[:] values, bint dropna): if sixty_four_bit_scalar is float64_t: ftable = kh_init_float64() - build_count_table_float64(values, ftable, dropna) + build_count_table_scalar64(values, ftable, dropna) result_keys = np.empty(ftable.n_occupied, dtype=np.float64) result_counts = 
np.zeros(ftable.n_occupied, dtype=np.int64) @@ -917,7 +938,7 @@ cpdef value_count_scalar64(sixty_four_bit_scalar[:] values, bint dropna): elif sixty_four_bit_scalar is int64_t: itable = kh_init_int64() - build_count_table_int64(values, itable) + build_count_table_scalar64(values, itable, dropna) result_keys = np.empty(itable.n_occupied, dtype=np.int64) result_counts = np.zeros(itable.n_occupied, dtype=np.int64) @@ -932,26 +953,6 @@ cpdef value_count_scalar64(sixty_four_bit_scalar[:] values, bint dropna): return np.asarray(result_keys), np.asarray(result_counts) -@cython.boundscheck(False) -cdef build_count_table_int64(int64_t[:] values, kh_int64_t *table): - cdef: - khiter_t k - Py_ssize_t i, n = len(values) - int64_t val - int ret = 0 - - with nogil: - kh_resize_int64(table, n) - - for i in range(n): - val = values[i] - k = kh_get_int64(table, val) - if k != table.n_buckets: - table.vals[k] += 1 - else: - k = kh_put_int64(table, val, &ret) - table.vals[k] = 1 - cdef build_count_table_object(ndarray[object] values, ndarray[uint8_t, cast=True] mask, @@ -1040,7 +1041,7 @@ def mode_int64(int64_t[:] values): table = kh_init_int64() - build_count_table_int64(values, table) + build_count_table_scalar64(values, table, 0) modes = np.empty(table.n_buckets, dtype=np.int64)