PERF: float hash slow in py3

chris-b1 · jreback · commit f98b4b541e7a · 2016-06-14T21:47:30.000-04:00
closes #13166 closes #13335 Author: Chris <cbartak@gmail.com> Closes #13436 from chris-b1/float-hash and squashes the following commits: 3aec078 [Chris] smaller benches, explanatory comment 339ad1a [Chris] PERF: float hash slow in py3
diff --git a/asv_bench/benchmarks/groupby.py b/asv_bench/benchmarks/groupby.py
@@ -379,15 +379,24 @@ def time_groupby_dt_timegrouper_size(self):
 #----------------------------------------------------------------------
 # groupby with a variable value for ngroups
 
-class groupby_ngroups_10000(object):
+class groupby_ngroups_int_10000(object):
     goal_time = 0.2
+    dtype = 'int'
+    ngroups = 10000
 
     def setup(self):
         np.random.seed(1234)
-        self.ngroups = 10000
-        self.size = (self.ngroups * 2)
-        self.rng = np.arange(self.ngroups)
-        self.df = DataFrame(dict(timestamp=self.rng.take(np.random.randint(0, self.ngroups, size=self.size)), value=np.random.randint(0, self.size, size=self.size)))
+        size = self.ngroups * 2
+        rng = np.arange(self.ngroups)
+        ts = rng.take(np.random.randint(0, self.ngroups, size=size))
+        if self.dtype == 'int':
+            value = np.random.randint(0, size, size=size)
+        else:
+            value = np.concatenate([np.random.random(self.ngroups) * 0.1,
+                                    np.random.random(self.ngroups) * 10.0])
+
+        self.df = DataFrame({'timestamp': ts,
+                             'value': value})
 
     def time_all(self):
         self.df.groupby('value')['timestamp'].all()
@@ -482,109 +491,35 @@ def time_value_counts(self):
     def time_var(self):
         self.df.groupby('value')['timestamp'].var()
 
-
-class groupby_ngroups_100(object):
+class groupby_ngroups_int_100(groupby_ngroups_int_10000):
     goal_time = 0.2
+    dtype = 'int'
+    ngroups = 100
 
-    def setup(self):
-        np.random.seed(1234)
-        self.ngroups = 100
-        self.size = (self.ngroups * 2)
-        self.rng = np.arange(self.ngroups)
-        self.df = DataFrame(dict(timestamp=self.rng.take(np.random.randint(0, self.ngroups, size=self.size)), value=np.random.randint(0, self.size, size=self.size)))
-
-    def time_all(self):
-        self.df.groupby('value')['timestamp'].all()
-
-    def time_any(self):
-        self.df.groupby('value')['timestamp'].any()
-
-    def time_count(self):
-        self.df.groupby('value')['timestamp'].count()
-
-    def time_cumcount(self):
-        self.df.groupby('value')['timestamp'].cumcount()
-
-    def time_cummax(self):
-        self.df.groupby('value')['timestamp'].cummax()
-
-    def time_cummin(self):
-        self.df.groupby('value')['timestamp'].cummin()
-
-    def time_cumprod(self):
-        self.df.groupby('value')['timestamp'].cumprod()
-
-    def time_cumsum(self):
-        self.df.groupby('value')['timestamp'].cumsum()
-
-    def time_describe(self):
-        self.df.groupby('value')['timestamp'].describe()
-
-    def time_diff(self):
-        self.df.groupby('value')['timestamp'].diff()
-
-    def time_first(self):
-        self.df.groupby('value')['timestamp'].first()
-
-    def time_head(self):
-        self.df.groupby('value')['timestamp'].head()
-
-    def time_last(self):
-        self.df.groupby('value')['timestamp'].last()
-
-    def time_mad(self):
-        self.df.groupby('value')['timestamp'].mad()
-
-    def time_max(self):
-        self.df.groupby('value')['timestamp'].max()
-
-    def time_mean(self):
-        self.df.groupby('value')['timestamp'].mean()
-
-    def time_median(self):
-        self.df.groupby('value')['timestamp'].median()
-
-    def time_min(self):
-        self.df.groupby('value')['timestamp'].min()
-
-    def time_nunique(self):
-        self.df.groupby('value')['timestamp'].nunique()
-
-    def time_pct_change(self):
-        self.df.groupby('value')['timestamp'].pct_change()
-
-    def time_prod(self):
-        self.df.groupby('value')['timestamp'].prod()
-
-    def time_rank(self):
-        self.df.groupby('value')['timestamp'].rank()
-
-    def time_sem(self):
-        self.df.groupby('value')['timestamp'].sem()
-
-    def time_size(self):
-        self.df.groupby('value')['timestamp'].size()
-
-    def time_skew(self):
-        self.df.groupby('value')['timestamp'].skew()
-
-    def time_std(self):
-        self.df.groupby('value')['timestamp'].std()
+class groupby_ngroups_float_100(groupby_ngroups_int_10000):
+    goal_time = 0.2
+    dtype = 'float'
+    ngroups = 100
 
-    def time_sum(self):
-        self.df.groupby('value')['timestamp'].sum()
+class groupby_ngroups_float_10000(groupby_ngroups_int_10000):
+    goal_time = 0.2
+    dtype = 'float'
+    ngroups = 10000
 
-    def time_tail(self):
-        self.df.groupby('value')['timestamp'].tail()
 
-    def time_unique(self):
-        self.df.groupby('value')['timestamp'].unique()
+class groupby_float32(object):
+    # GH 13335
+    goal_time = 0.2
 
-    def time_value_counts(self):
-        self.df.groupby('value')['timestamp'].value_counts()
+    def setup(self):
+        tmp1 = (np.random.random(10000) * 0.1).astype(np.float32)
+        tmp2 = (np.random.random(10000) * 10.0).astype(np.float32)
+        tmp = np.concatenate((tmp1, tmp2))
+        arr = np.repeat(tmp, 10)
+        self.df = DataFrame(dict(a=arr, b=arr))
 
-    def time_var(self):
-        self.df.groupby('value')['timestamp'].var()
+    def time_groupby_sum(self):
+        self.df.groupby(['a'])['b'].sum()
 
 
 #----------------------------------------------------------------------
diff --git a/asv_bench/benchmarks/indexing.py b/asv_bench/benchmarks/indexing.py
@@ -486,4 +486,17 @@ def setup(self):
         self.midx = self.midx.take(np.random.permutation(np.arange(100000)))
 
     def time_sort_level_zero(self):
-        self.midx.sortlevel(0)
+        self.midx.sortlevel(0)
+
+class float_loc(object):
+    # GH 13166
+    goal_time = 0.2
+
+    def setup(self):
+        a = np.arange(100000)
+        self.ind = pd.Float64Index(a * 4.8000000418824129e-08)
+
+    def time_float_loc(self):
+        self.ind.get_loc(0)
+
+
diff --git a/doc/source/whatsnew/v0.18.2.txt b/doc/source/whatsnew/v0.18.2.txt
@@ -345,7 +345,7 @@ Performance Improvements
 - Improved performance of sparse arithmetic with ``BlockIndex`` when the number of blocks are large, though recommended to use ``IntIndex`` in such cases (:issue:`13082`)
 - increased performance of ``DataFrame.quantile()`` as it now operates per-block (:issue:`11623`)
 
-
+- Improved performance of float64 hash table operations, fixing some very slow indexing and groupby operations in python 3 (:issue:`13166`, :issue:`13335`)
 - Improved performance of ``DataFrameGroupBy.transform`` (:issue:`12737`)
 
 
diff --git a/pandas/src/klib/khash_python.h b/pandas/src/klib/khash_python.h
@@ -2,9 +2,21 @@
 
 #include "khash.h"
 
-// kludge
-
-#define kh_float64_hash_func _Py_HashDouble
+// Previously we were using the built-in CPython hash function for doubles
+// python 2.7 https://github.com/python/cpython/blob/2.7/Objects/object.c#L1021
+// python 3.5 https://github.com/python/cpython/blob/3.5/Python/pyhash.c#L85
+
+// The python 3 hash function has the invariant hash(x) == hash(int(x)) == hash(decimal(x))
+// and the size of hash may be different by platform / version (long in py2, Py_ssize_t in py3).
+// We don't need those invariants because types will be cast before hashing, and if Py_ssize_t
+// is 64 bits the truncation causes collision issues.  Given all that, we use our own
+// simple hash, viewing the double bytes as an int64 and using khash's default
+// hash for 64 bit integers.
+// GH 13436
+inline khint64_t asint64(double key) {
+  return *(khint64_t *)(&key);
+}
+#define kh_float64_hash_func(key) (khint32_t)((asint64(key))>>33^(asint64(key))^(asint64(key))<<11)
 #define kh_float64_hash_equal(a, b) ((a) == (b) || ((b) != (b) && (a) != (a)))
 
 #define KHASH_MAP_INIT_FLOAT64(name, khval_t)								\