diff --git a/doc/source/whatsnew/v0.17.0.txt b/doc/source/whatsnew/v0.17.0.txt
index 7e69a8044a305..8ae2aa1659077 100644
--- a/doc/source/whatsnew/v0.17.0.txt
+++ b/doc/source/whatsnew/v0.17.0.txt
@@ -585,7 +585,7 @@ Performance Improvements
 - Improved performance of ``Series.isin`` for datetimelike/integer Series (:issue:`10287`)
 - 20x improvement in ``concat`` of Categoricals when categories are identical (:issue:`10587`)
 - Improved performance of ``to_datetime`` when specified format string is ISO8601 (:issue:`10178`)
-
+- 2x improvement in ``Series.value_counts`` for float dtype (:issue:`10821`)
 
 .. _whatsnew_0170.bug_fixes:
 
diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
index b0c7ff43bc7d8..0b11a2bae3973 100644
--- a/pandas/core/algorithms.py
+++ b/pandas/core/algorithms.py
@@ -232,7 +232,7 @@ def value_counts(values, sort=True, ascending=False, normalize=False,
             values = PeriodIndex(values, name=name)
 
         values = values.view(np.int64)
-        keys, counts = htable.value_count_int64(values)
+        keys, counts = htable.value_count_scalar64(values, dropna)
 
         if dropna:
             from pandas.tslib import iNaT
@@ -244,7 +244,10 @@ def value_counts(values, sort=True, ascending=False, normalize=False,
 
     elif com.is_integer_dtype(dtype):
         values = com._ensure_int64(values)
-        keys, counts = htable.value_count_int64(values)
+        keys, counts = htable.value_count_scalar64(values, dropna)
+
+    elif com.is_float_dtype(dtype):
+        values = com._ensure_float64(values)
+        keys, counts = htable.value_count_scalar64(values, dropna)
 
     else:
         values = com._ensure_object(values)
diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py
index c9e30ea31dab8..b204cba997b98 100644
--- a/pandas/core/categorical.py
+++ b/pandas/core/categorical.py
@@ -1030,7 +1030,7 @@ def value_counts(self, dropna=True):
         from pandas.core.index import CategoricalIndex
 
         cat = self.dropna() if dropna else self
-        keys, counts = htable.value_count_int64(com._ensure_int64(cat._codes))
+        keys, counts = htable.value_count_scalar64(com._ensure_int64(cat._codes), dropna)
         result = Series(counts, index=keys)
 
         ix = np.arange(len(cat.categories), dtype='int64')
diff --git a/pandas/hashtable.pyx b/pandas/hashtable.pyx
index 7dbd1b45c938f..dfa7930ada62f 100644
--- a/pandas/hashtable.pyx
+++ b/pandas/hashtable.pyx
@@ -866,51 +866,90 @@ cdef class Int64Factorizer:
         self.count = len(self.uniques)
         return labels
 
-
+ctypedef fused kh_scalar64:
+    kh_int64_t
+    kh_float64_t
 
 @cython.boundscheck(False)
-cdef build_count_table_int64(int64_t[:] values, kh_int64_t *table):
+cdef build_count_table_scalar64(sixty_four_bit_scalar[:] values,
+                                kh_scalar64 *table, bint dropna):
     cdef:
         khiter_t k
         Py_ssize_t i, n = len(values)
-        int64_t val
+        sixty_four_bit_scalar val
         int ret = 0
 
-    with nogil:
-        kh_resize_int64(table, n)
+    if sixty_four_bit_scalar is float64_t and kh_scalar64 is kh_float64_t:
+        with nogil:
+            kh_resize_float64(table, n)
+
+            for i in range(n):
+                val = values[i]
+                # val != val only for NaN, so NaN is skipped when dropna is set
+                if val == val or not dropna:
+                    k = kh_get_float64(table, val)
+                    if k != table.n_buckets:
+                        table.vals[k] += 1
+                    else:
+                        k = kh_put_float64(table, val, &ret)
+                        table.vals[k] = 1
+    elif sixty_four_bit_scalar is int64_t and kh_scalar64 is kh_int64_t:
+        with nogil:
+            kh_resize_int64(table, n)
+
+            for i in range(n):
+                val = values[i]
+                k = kh_get_int64(table, val)
+                if k != table.n_buckets:
+                    table.vals[k] += 1
+                else:
+                    k = kh_put_int64(table, val, &ret)
+                    table.vals[k] = 1
+    else:
+        raise ValueError("Table type must match scalar type.")
 
-        for i in range(n):
-            val = values[i]
-            k = kh_get_int64(table, val)
-            if k != table.n_buckets:
-                table.vals[k] += 1
-            else:
-                k = kh_put_int64(table, val, &ret)
-                table.vals[k] = 1
 
 @cython.boundscheck(False)
-cpdef value_count_int64(int64_t[:] values):
+cpdef value_count_scalar64(sixty_four_bit_scalar[:] values, bint dropna):
     cdef:
         Py_ssize_t i
-        kh_int64_t *table
-        int64_t[:] result_keys, result_counts
+        kh_float64_t *ftable
+        kh_int64_t *itable
+        sixty_four_bit_scalar[:] result_keys
+        int64_t[:] result_counts
         int k
 
-    table = kh_init_int64()
-    build_count_table_int64(values, table)
-
-    i = 0
-    result_keys = np.empty(table.n_occupied, dtype=np.int64)
-    result_counts = np.zeros(table.n_occupied, dtype=np.int64)
-    with nogil:
-        for k in range(table.n_buckets):
-            if kh_exist_int64(table, k):
-                result_keys[i] = table.keys[k]
-                result_counts[i] = table.vals[k]
-                i += 1
-    kh_destroy_int64(table)
+    i = 0
+
+    if sixty_four_bit_scalar is float64_t:
+        ftable = kh_init_float64()
+        build_count_table_scalar64(values, ftable, dropna)
+
+        result_keys = np.empty(ftable.n_occupied, dtype=np.float64)
+        result_counts = np.zeros(ftable.n_occupied, dtype=np.int64)
+
+        with nogil:
+            for k in range(ftable.n_buckets):
+                if kh_exist_float64(ftable, k):
+                    result_keys[i] = ftable.keys[k]
+                    result_counts[i] = ftable.vals[k]
+                    i += 1
+        kh_destroy_float64(ftable)
+
+    elif sixty_four_bit_scalar is int64_t:
+        itable = kh_init_int64()
+        build_count_table_scalar64(values, itable, dropna)
+
+        result_keys = np.empty(itable.n_occupied, dtype=np.int64)
+        result_counts = np.zeros(itable.n_occupied, dtype=np.int64)
+
+        with nogil:
+            for k in range(itable.n_buckets):
+                if kh_exist_int64(itable, k):
+                    result_keys[i] = itable.keys[k]
+                    result_counts[i] = itable.vals[k]
+                    i += 1
+        kh_destroy_int64(itable)
 
     return np.asarray(result_keys), np.asarray(result_counts)
 
@@ -1002,7 +1041,7 @@ def mode_int64(int64_t[:] values):
 
     table = kh_init_int64()
-    build_count_table_int64(values, table)
+    build_count_table_scalar64(values, table, 0)
 
     modes = np.empty(table.n_buckets, dtype=np.int64)
 
diff --git a/vb_suite/groupby.py b/vb_suite/groupby.py
index 73f5f19d6a626..bceb78c26e6ac 100644
--- a/vb_suite/groupby.py
+++ b/vb_suite/groupby.py
@@ -194,6 +194,15 @@ def f():
 series_value_counts_strings = Benchmark('s.value_counts()', setup,
                                         start_date=datetime(2011, 10, 21))
 
+# value_counts on float dtype
+
+setup = common_setup + """
+s = Series(np.random.randint(0, 1000, size=100000)).astype(float)
+"""
+
+series_value_counts_float64 = Benchmark('s.value_counts()', setup,
+                                        start_date=datetime(2015, 8, 17))
+
 #----------------------------------------------------------------------
 # pivot_table
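
As a quick functional check to accompany the patch (this snippet is not part of the diff and only uses the public ``Series.value_counts`` API), the ``dropna`` handling implemented by the ``val == val or not dropna`` branch in ``build_count_table_scalar64`` can be exercised on float data like this:

import numpy as np
import pandas as pd

# Float Series with a NaN every 100th element; mirrors the vb_suite setup above.
s = pd.Series(np.random.randint(0, 1000, size=100000)).astype(float)
s.iloc[::100] = np.nan

counts = s.value_counts()                  # default dropna=True: NaN is excluded
counts_all = s.value_counts(dropna=False)  # NaN is kept as its own key

# The dropped NaNs account exactly for the difference in totals.
assert counts.sum() == s.notnull().sum()
assert counts_all.sum() == len(s)

Timing ``s.value_counts()`` before and after the change should show roughly the 2x speed-up cited in the whatsnew entry, though the exact factor depends on the data and the machine.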