From 339ad1a3ee84c93a2f691d4d19d86860008bf3fd Mon Sep 17 00:00:00 2001
From: Chris
Date: Mon, 13 Jun 2016 21:00:25 -0500
Subject: [PATCH 1/2] PERF: float hash slow in py3

---
 asv_bench/benchmarks/groupby.py  | 137 ++++++++----------------------
 asv_bench/benchmarks/indexing.py |  15 +++-
 doc/source/whatsnew/v0.18.2.txt  |   2 +-
 pandas/src/klib/khash_python.h   |   7 +-
 4 files changed, 55 insertions(+), 106 deletions(-)

diff --git a/asv_bench/benchmarks/groupby.py b/asv_bench/benchmarks/groupby.py
index 586bd00b091fe..c9ee46d9bd531 100644
--- a/asv_bench/benchmarks/groupby.py
+++ b/asv_bench/benchmarks/groupby.py
@@ -379,15 +379,24 @@ def time_groupby_dt_timegrouper_size(self):
 #----------------------------------------------------------------------
 # groupby with a variable value for ngroups
 
-class groupby_ngroups_10000(object):
+class groupby_ngroups_int_10000(object):
     goal_time = 0.2
+    dtype = 'int'
+    ngroups = 10000
 
     def setup(self):
         np.random.seed(1234)
-        self.ngroups = 10000
-        self.size = (self.ngroups * 2)
-        self.rng = np.arange(self.ngroups)
-        self.df = DataFrame(dict(timestamp=self.rng.take(np.random.randint(0, self.ngroups, size=self.size)), value=np.random.randint(0, self.size, size=self.size)))
+        size = self.ngroups * 2
+        rng = np.arange(self.ngroups)
+        ts = rng.take(np.random.randint(0, self.ngroups, size=size))
+        if self.dtype == 'int':
+            value = np.random.randint(0, size, size=size)
+        else:
+            value = np.concatenate([np.random.random(self.ngroups) * 0.1,
+                                    np.random.random(self.ngroups) * 10.0])
+
+        self.df = DataFrame({'timestamp': ts,
+                             'value': value})
 
     def time_all(self):
         self.df.groupby('value')['timestamp'].all()
@@ -482,109 +491,35 @@ def time_value_counts(self):
     def time_var(self):
         self.df.groupby('value')['timestamp'].var()
 
-
-class groupby_ngroups_100(object):
+class groupby_ngroups_int_100(groupby_ngroups_int_10000):
     goal_time = 0.2
+    dtype = 'int'
+    ngroups = 100
 
-    def setup(self):
-        np.random.seed(1234)
-        self.ngroups = 100
-        self.size = (self.ngroups * 2)
-        self.rng = np.arange(self.ngroups)
-        self.df = DataFrame(dict(timestamp=self.rng.take(np.random.randint(0, self.ngroups, size=self.size)), value=np.random.randint(0, self.size, size=self.size)))
-
-    def time_all(self):
-        self.df.groupby('value')['timestamp'].all()
-
-    def time_any(self):
-        self.df.groupby('value')['timestamp'].any()
-
-    def time_count(self):
-        self.df.groupby('value')['timestamp'].count()
-
-    def time_cumcount(self):
-        self.df.groupby('value')['timestamp'].cumcount()
-
-    def time_cummax(self):
-        self.df.groupby('value')['timestamp'].cummax()
-
-    def time_cummin(self):
-        self.df.groupby('value')['timestamp'].cummin()
-
-    def time_cumprod(self):
-        self.df.groupby('value')['timestamp'].cumprod()
-
-    def time_cumsum(self):
-        self.df.groupby('value')['timestamp'].cumsum()
-
-    def time_describe(self):
-        self.df.groupby('value')['timestamp'].describe()
-
-    def time_diff(self):
-        self.df.groupby('value')['timestamp'].diff()
-
-    def time_first(self):
-        self.df.groupby('value')['timestamp'].first()
-
-    def time_head(self):
-        self.df.groupby('value')['timestamp'].head()
-
-    def time_last(self):
-        self.df.groupby('value')['timestamp'].last()
-
-    def time_mad(self):
-        self.df.groupby('value')['timestamp'].mad()
-
-    def time_max(self):
-        self.df.groupby('value')['timestamp'].max()
-
-    def time_mean(self):
-        self.df.groupby('value')['timestamp'].mean()
-
-    def time_median(self):
-        self.df.groupby('value')['timestamp'].median()
-
-    def time_min(self):
-        self.df.groupby('value')['timestamp'].min()
-
-    def time_nunique(self):
-        self.df.groupby('value')['timestamp'].nunique()
-
-    def time_pct_change(self):
-        self.df.groupby('value')['timestamp'].pct_change()
-
-    def time_prod(self):
-        self.df.groupby('value')['timestamp'].prod()
-
-    def time_rank(self):
-        self.df.groupby('value')['timestamp'].rank()
-
-    def time_sem(self):
-        self.df.groupby('value')['timestamp'].sem()
-
-    def time_size(self):
-        self.df.groupby('value')['timestamp'].size()
-
-    def time_skew(self):
-        self.df.groupby('value')['timestamp'].skew()
-
-    def time_std(self):
-        self.df.groupby('value')['timestamp'].std()
+class groupby_ngroups_float_100(groupby_ngroups_int_10000):
+    goal_time = 0.2
+    dtype = 'float'
+    ngroups = 100
 
-    def time_sum(self):
-        self.df.groupby('value')['timestamp'].sum()
+class groupby_ngroups_float_10000(groupby_ngroups_int_10000):
+    goal_time = 0.2
+    dtype = 'float'
+    ngroups = 10000
 
 
-    def time_tail(self):
-        self.df.groupby('value')['timestamp'].tail()
-
-    def time_unique(self):
-        self.df.groupby('value')['timestamp'].unique()
+class groupby_float32(object):
+    # GH 13335
+    goal_time = 0.2
 
-    def time_value_counts(self):
-        self.df.groupby('value')['timestamp'].value_counts()
+    def setup(self):
+        tmp1 = (np.random.random(10000) * 0.1).astype(np.float32)
+        tmp2 = (np.random.random(10000) * 10.0).astype(np.float32)
+        tmp = np.concatenate((tmp1, tmp2))
+        arr = np.repeat(tmp, 100)
+        self.df = DataFrame(dict(a=arr, b=arr))
 
-    def time_var(self):
-        self.df.groupby('value')['timestamp'].var()
+    def time_groupby_sum(self):
+        self.df.groupby(['a'])['b'].sum()
 
 
 #----------------------------------------------------------------------
diff --git a/asv_bench/benchmarks/indexing.py b/asv_bench/benchmarks/indexing.py
index 32d80a7913234..dc15d9e25e481 100644
--- a/asv_bench/benchmarks/indexing.py
+++ b/asv_bench/benchmarks/indexing.py
@@ -486,4 +486,17 @@ def setup(self):
         self.midx = self.midx.take(np.random.permutation(np.arange(100000)))
 
     def time_sort_level_zero(self):
-        self.midx.sortlevel(0)
\ No newline at end of file
+        self.midx.sortlevel(0)
+
+class float_loc(object):
+    # GH 13166
+    goal_time = 0.2
+
+    def setup(self):
+        a = np.arange(1000000)
+        self.ind = pd.Float64Index(a * 4.8000000418824129e-08)
+
+    def time_float_loc(self):
+        self.ind.get_loc(0)
+
+
diff --git a/doc/source/whatsnew/v0.18.2.txt b/doc/source/whatsnew/v0.18.2.txt
index 105194e504f45..0b8b7b56fd36b 100644
--- a/doc/source/whatsnew/v0.18.2.txt
+++ b/doc/source/whatsnew/v0.18.2.txt
@@ -307,7 +307,7 @@ Performance Improvements
 
 - Improved performance of sparse arithmetic with ``BlockIndex`` when the number of blocks are large, though recommended to use ``IntIndex`` in such cases (:issue:`13082`)
 - increased performance of ``DataFrame.quantile()`` as it now operates per-block (:issue:`11623`)
 
-
+- Improved performance of the ``float64`` hash table, fixing some very slow indexing and groupby operations in Python 3 (:issue:`13166`, :issue:`13334`)
 
 - Improved performance of ``DataFrameGroupBy.transform`` (:issue:`12737`)
diff --git a/pandas/src/klib/khash_python.h b/pandas/src/klib/khash_python.h
index cdd94b5d8522f..d9fc012df6785 100644
--- a/pandas/src/klib/khash_python.h
+++ b/pandas/src/klib/khash_python.h
@@ -2,9 +2,10 @@
 
 #include "khash.h"
 
-// kludge
-
-#define kh_float64_hash_func _Py_HashDouble
+inline khint64_t asint64(double key) {
+    return *(khint64_t *)(&key);
+}
+#define kh_float64_hash_func(key) (khint32_t)((asint64(key))>>33^(asint64(key))^(asint64(key))<<11)
 #define kh_float64_hash_equal(a, b) ((a) == (b) || ((b) != (b) && (a) != (a)))
 
 #define KHASH_MAP_INIT_FLOAT64(name, khval_t) \

From 3aec078d48670c6e3db6838f8b41667bf8b1088f Mon Sep 17 00:00:00 2001
From: Chris
Date: Tue, 14 Jun 2016 19:59:44 -0500
Subject: [PATCH 2/2] smaller benches, explanatory comment

---
 asv_bench/benchmarks/groupby.py  |  2 +-
 asv_bench/benchmarks/indexing.py |  2 +-
 pandas/src/klib/khash_python.h   | 11 +++++++++++
 3 files changed, 13 insertions(+), 2 deletions(-)

diff --git a/asv_bench/benchmarks/groupby.py b/asv_bench/benchmarks/groupby.py
index c9ee46d9bd531..0611a3564ff7a 100644
--- a/asv_bench/benchmarks/groupby.py
+++ b/asv_bench/benchmarks/groupby.py
@@ -515,7 +515,7 @@ def setup(self):
         tmp1 = (np.random.random(10000) * 0.1).astype(np.float32)
         tmp2 = (np.random.random(10000) * 10.0).astype(np.float32)
         tmp = np.concatenate((tmp1, tmp2))
-        arr = np.repeat(tmp, 100)
+        arr = np.repeat(tmp, 10)
         self.df = DataFrame(dict(a=arr, b=arr))
 
     def time_groupby_sum(self):
diff --git a/asv_bench/benchmarks/indexing.py b/asv_bench/benchmarks/indexing.py
index dc15d9e25e481..53d37a8161f43 100644
--- a/asv_bench/benchmarks/indexing.py
+++ b/asv_bench/benchmarks/indexing.py
@@ -493,7 +493,7 @@ class float_loc(object):
     goal_time = 0.2
 
     def setup(self):
-        a = np.arange(1000000)
+        a = np.arange(100000)
         self.ind = pd.Float64Index(a * 4.8000000418824129e-08)
 
     def time_float_loc(self):
diff --git a/pandas/src/klib/khash_python.h b/pandas/src/klib/khash_python.h
index d9fc012df6785..7684493d08855 100644
--- a/pandas/src/klib/khash_python.h
+++ b/pandas/src/klib/khash_python.h
@@ -2,6 +2,17 @@
 
 #include "khash.h"
 
+// Previously we were using the built-in CPython hash function for doubles
+// Python 2.7: https://github.com/python/cpython/blob/2.7/Objects/object.c#L1021
+// Python 3.5: https://github.com/python/cpython/blob/3.5/Python/pyhash.c#L85
+
+// The Python 3 hash function has the invariant hash(x) == hash(int(x)) == hash(decimal(x)),
+// and the size of the hash may differ by platform / version (long in py2, Py_ssize_t in py3).
+// We don't need those invariants because types will be cast before hashing, and if Py_ssize_t
+// is 64 bits the truncation causes collision issues.  Given all that, we use our own
+// simple hash, viewing the double bytes as an int64 and using khash's default
+// hash for 64 bit integers.
+// GH 13436
 inline khint64_t asint64(double key) {
     return *(khint64_t *)(&key);
 }
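Note on the final hash scheme: the patched kh_float64_hash_func views the double's bytes as a 64-bit integer (the asint64 helper) and then applies khash's default 64-bit integer mix, truncated to 32 bits. Below is a minimal standalone sketch of that scheme, for illustration only; it is not part of the patches. The file and function names (demo.c, asint64_bits, float64_hash) are hypothetical, and it uses memcpy for the bit reinterpretation where the patch itself uses a pointer cast.

    /* demo.c -- standalone illustration of the kh_float64_hash_func scheme. */
    #include <stdio.h>
    #include <stdint.h>
    #include <string.h>
    #include <inttypes.h>

    /* Same idea as the patch's asint64(), but via memcpy rather than a
       pointer cast, keeping this sketch free of aliasing concerns. */
    static uint64_t asint64_bits(double key)
    {
        uint64_t bits;
        memcpy(&bits, &key, sizeof bits);
        return bits;
    }

    /* Mirrors kh_float64_hash_func: (k >> 33) ^ k ^ (k << 11), truncated
       to 32 bits -- khash's default hash for 64-bit integers. */
    static uint32_t float64_hash(double key)
    {
        uint64_t k = asint64_bits(key);
        return (uint32_t)((k >> 33) ^ k ^ (k << 11));
    }

    int main(void)
    {
        /* Small fractional keys, like the multiples of 0.1 and of
           4.8e-08 built by the new benchmarks above. */
        for (int i = 1; i <= 5; i++) {
            double x = i * 0.1;
            printf("%.1f -> %08" PRIx32 "\n", x, float64_hash(x));
        }
        return 0;
    }

Because all 64 bits of the double feed the mix before the 32-bit truncation, such keys land in well-spread buckets rather than colliding the way the truncated Python 3 hash did, per the comment added in the second patch.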