From 339ad1a3ee84c93a2f691d4d19d86860008bf3fd Mon Sep 17 00:00:00 2001
From: Chris
Date: Mon, 13 Jun 2016 21:00:25 -0500
Subject: [PATCH 1/2] PERF: float hash slow in py3

---
 asv_bench/benchmarks/groupby.py  | 137 ++++++++----------------------
 asv_bench/benchmarks/indexing.py |  15 +++-
 doc/source/whatsnew/v0.18.2.txt  |   2 +-
 pandas/src/klib/khash_python.h   |   7 +-
 4 files changed, 55 insertions(+), 106 deletions(-)

diff --git a/asv_bench/benchmarks/groupby.py b/asv_bench/benchmarks/groupby.py
index 586bd00b091fe..c9ee46d9bd531 100644
--- a/asv_bench/benchmarks/groupby.py
+++ b/asv_bench/benchmarks/groupby.py
@@ -379,15 +379,24 @@ def time_groupby_dt_timegrouper_size(self):
 #----------------------------------------------------------------------
 # groupby with a variable value for ngroups
 
-class groupby_ngroups_10000(object):
+class groupby_ngroups_int_10000(object):
     goal_time = 0.2
+    dtype = 'int'
+    ngroups = 10000
 
     def setup(self):
         np.random.seed(1234)
-        self.ngroups = 10000
-        self.size = (self.ngroups * 2)
-        self.rng = np.arange(self.ngroups)
-        self.df = DataFrame(dict(timestamp=self.rng.take(np.random.randint(0, self.ngroups, size=self.size)), value=np.random.randint(0, self.size, size=self.size)))
+        size = self.ngroups * 2
+        rng = np.arange(self.ngroups)
+        ts = rng.take(np.random.randint(0, self.ngroups, size=size))
+        if self.dtype == 'int':
+            value = np.random.randint(0, size, size=size)
+        else:
+            value = np.concatenate([np.random.random(self.ngroups) * 0.1,
+                                    np.random.random(self.ngroups) * 10.0])
+
+        self.df = DataFrame({'timestamp': ts,
+                             'value': value})
 
     def time_all(self):
         self.df.groupby('value')['timestamp'].all()
@@ -482,109 +491,35 @@ def time_value_counts(self):
     def time_var(self):
         self.df.groupby('value')['timestamp'].var()
 
-
-class groupby_ngroups_100(object):
+class groupby_ngroups_int_100(groupby_ngroups_int_10000):
     goal_time = 0.2
+    dtype = 'int'
+    ngroups = 100
 
-    def setup(self):
-        np.random.seed(1234)
-        self.ngroups = 100
-        self.size = (self.ngroups * 2)
-        self.rng = np.arange(self.ngroups)
-        self.df = DataFrame(dict(timestamp=self.rng.take(np.random.randint(0, self.ngroups, size=self.size)), value=np.random.randint(0, self.size, size=self.size)))
-
-    def time_all(self):
-        self.df.groupby('value')['timestamp'].all()
-
-    def time_any(self):
-        self.df.groupby('value')['timestamp'].any()
-
-    def time_count(self):
-        self.df.groupby('value')['timestamp'].count()
-
-    def time_cumcount(self):
-        self.df.groupby('value')['timestamp'].cumcount()
-
-    def time_cummax(self):
-        self.df.groupby('value')['timestamp'].cummax()
-
-    def time_cummin(self):
-        self.df.groupby('value')['timestamp'].cummin()
-
-    def time_cumprod(self):
-        self.df.groupby('value')['timestamp'].cumprod()
-
-    def time_cumsum(self):
-        self.df.groupby('value')['timestamp'].cumsum()
-
-    def time_describe(self):
-        self.df.groupby('value')['timestamp'].describe()
-
-    def time_diff(self):
-        self.df.groupby('value')['timestamp'].diff()
-
-    def time_first(self):
-        self.df.groupby('value')['timestamp'].first()
-
-    def time_head(self):
-        self.df.groupby('value')['timestamp'].head()
-
-    def time_last(self):
-        self.df.groupby('value')['timestamp'].last()
-
-    def time_mad(self):
-        self.df.groupby('value')['timestamp'].mad()
-
-    def time_max(self):
-        self.df.groupby('value')['timestamp'].max()
-
-    def time_mean(self):
-        self.df.groupby('value')['timestamp'].mean()
-
-    def time_median(self):
-        self.df.groupby('value')['timestamp'].median()
-
-    def time_min(self):
-        self.df.groupby('value')['timestamp'].min()
-
-    def time_nunique(self):
-        self.df.groupby('value')['timestamp'].nunique()
-
-    def time_pct_change(self):
-        self.df.groupby('value')['timestamp'].pct_change()
-
-    def time_prod(self):
-        self.df.groupby('value')['timestamp'].prod()
-
-    def time_rank(self):
-        self.df.groupby('value')['timestamp'].rank()
-
-    def time_sem(self):
-        self.df.groupby('value')['timestamp'].sem()
-
-    def time_size(self):
-        self.df.groupby('value')['timestamp'].size()
-
-    def time_skew(self):
-        self.df.groupby('value')['timestamp'].skew()
-
-    def time_std(self):
-        self.df.groupby('value')['timestamp'].std()
+class groupby_ngroups_float_100(groupby_ngroups_int_10000):
+    goal_time = 0.2
+    dtype = 'float'
+    ngroups = 100
 
-    def time_sum(self):
-        self.df.groupby('value')['timestamp'].sum()
+class groupby_ngroups_float_10000(groupby_ngroups_int_10000):
+    goal_time = 0.2
+    dtype = 'float'
+    ngroups = 10000
 
 
-    def time_tail(self):
-        self.df.groupby('value')['timestamp'].tail()
-
-    def time_unique(self):
-        self.df.groupby('value')['timestamp'].unique()
+class groupby_float32(object):
+    # GH 13335
+    goal_time = 0.2
 
-    def time_value_counts(self):
-        self.df.groupby('value')['timestamp'].value_counts()
+    def setup(self):
+        tmp1 = (np.random.random(10000) * 0.1).astype(np.float32)
+        tmp2 = (np.random.random(10000) * 10.0).astype(np.float32)
+        tmp = np.concatenate((tmp1, tmp2))
+        arr = np.repeat(tmp, 100)
+        self.df = DataFrame(dict(a=arr, b=arr))
 
-    def time_var(self):
-        self.df.groupby('value')['timestamp'].var()
+    def time_groupby_sum(self):
+        self.df.groupby(['a'])['b'].sum()
 
 
 #----------------------------------------------------------------------
diff --git a/asv_bench/benchmarks/indexing.py b/asv_bench/benchmarks/indexing.py
index 32d80a7913234..dc15d9e25e481 100644
--- a/asv_bench/benchmarks/indexing.py
+++ b/asv_bench/benchmarks/indexing.py
@@ -486,4 +486,17 @@ def setup(self):
         self.midx = self.midx.take(np.random.permutation(np.arange(100000)))
 
     def time_sort_level_zero(self):
-        self.midx.sortlevel(0)
\ No newline at end of file
+        self.midx.sortlevel(0)
+
+class float_loc(object):
+    # GH 13166
+    goal_time = 0.2
+
+    def setup(self):
+        a = np.arange(1000000)
+        self.ind = pd.Float64Index(a * 4.8000000418824129e-08)
+
+    def time_float_loc(self):
+        self.ind.get_loc(0)
+
+
diff --git a/doc/source/whatsnew/v0.18.2.txt b/doc/source/whatsnew/v0.18.2.txt
index 105194e504f45..0b8b7b56fd36b 100644
--- a/doc/source/whatsnew/v0.18.2.txt
+++ b/doc/source/whatsnew/v0.18.2.txt
@@ -307,7 +307,7 @@ Performance Improvements
 
 - Improved performance of sparse arithmetic with ``BlockIndex`` when the number of blocks are large, though recommended to use ``IntIndex`` in such cases (:issue:`13082`)
 - increased performance of ``DataFrame.quantile()`` as it now operates per-block (:issue:`11623`)
 
-
+- Improved performance of the ``float64`` hash table, fixing some very slow indexing and groupby operations in Python 3 (:issue:`13166`, :issue:`13334`)
 
 - Improved performance of ``DataFrameGroupBy.transform`` (:issue:`12737`)
diff --git a/pandas/src/klib/khash_python.h b/pandas/src/klib/khash_python.h
index cdd94b5d8522f..d9fc012df6785 100644
--- a/pandas/src/klib/khash_python.h
+++ b/pandas/src/klib/khash_python.h
@@ -2,9 +2,10 @@
 
 #include "khash.h"
 
-// kludge
-
-#define kh_float64_hash_func _Py_HashDouble
+inline khint64_t asint64(double key) {
+    return *(khint64_t *)(&key);
+}
+#define kh_float64_hash_func(key) (khint32_t)((asint64(key))>>33^(asint64(key))^(asint64(key))<<11)
 #define kh_float64_hash_equal(a, b) ((a) == (b) || ((b) != (b) && (a) != (a)))
 
 #define KHASH_MAP_INIT_FLOAT64(name, khval_t) \

From 3aec078d48670c6e3db6838f8b41667bf8b1088f Mon Sep 17 00:00:00 2001
From: Chris
Date: Tue, 14 Jun 2016 19:59:44 -0500
Subject: [PATCH 2/2] smaller benches, explanatory comment

---
 asv_bench/benchmarks/groupby.py  |  2 +-
 asv_bench/benchmarks/indexing.py |  2 +-
 pandas/src/klib/khash_python.h   | 11 +++++++++++
 3 files changed, 13 insertions(+), 2 deletions(-)

diff --git a/asv_bench/benchmarks/groupby.py b/asv_bench/benchmarks/groupby.py
index c9ee46d9bd531..0611a3564ff7a 100644
--- a/asv_bench/benchmarks/groupby.py
+++ b/asv_bench/benchmarks/groupby.py
@@ -515,7 +515,7 @@ def setup(self):
         tmp1 = (np.random.random(10000) * 0.1).astype(np.float32)
         tmp2 = (np.random.random(10000) * 10.0).astype(np.float32)
         tmp = np.concatenate((tmp1, tmp2))
-        arr = np.repeat(tmp, 100)
+        arr = np.repeat(tmp, 10)
         self.df = DataFrame(dict(a=arr, b=arr))
 
     def time_groupby_sum(self):
diff --git a/asv_bench/benchmarks/indexing.py b/asv_bench/benchmarks/indexing.py
index dc15d9e25e481..53d37a8161f43 100644
--- a/asv_bench/benchmarks/indexing.py
+++ b/asv_bench/benchmarks/indexing.py
@@ -493,7 +493,7 @@ class float_loc(object):
     goal_time = 0.2
 
     def setup(self):
-        a = np.arange(1000000)
+        a = np.arange(100000)
         self.ind = pd.Float64Index(a * 4.8000000418824129e-08)
 
     def time_float_loc(self):
diff --git a/pandas/src/klib/khash_python.h b/pandas/src/klib/khash_python.h
index d9fc012df6785..7684493d08855 100644
--- a/pandas/src/klib/khash_python.h
+++ b/pandas/src/klib/khash_python.h
@@ -2,6 +2,17 @@
 
 #include "khash.h"
 
+// Previously we were using the built-in CPython hash function for doubles
+// Python 2.7: https://github.com/python/cpython/blob/2.7/Objects/object.c#L1021
+// Python 3.5: https://github.com/python/cpython/blob/3.5/Python/pyhash.c#L85
+
+// The Python 3 hash function has the invariant hash(x) == hash(int(x)) == hash(decimal(x)),
+// and the size of the hash may differ by platform / version (long in py2, Py_ssize_t in py3).
+// We don't need those invariants because types will be cast before hashing, and if Py_ssize_t
+// is 64 bits the truncation causes collision issues.  Given all that, we use our own
+// simple hash, viewing the double bytes as an int64 and using khash's default
+// hash for 64 bit integers.
+// GH 13436
 inline khint64_t asint64(double key) {
     return *(khint64_t *)(&key);
 }
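Note on the final hash scheme: the patched kh_float64_hash_func views the double's bytes as a 64-bit integer (the asint64 helper) and then applies khash's default 64-bit integer mix, truncated to 32 bits. Below is a minimal standalone sketch of that scheme, for illustration only; it is not part of the patches. The file and function names (demo.c, asint64_bits, float64_hash) are hypothetical, and it uses memcpy for the bit reinterpretation where the patch itself uses a pointer cast.

    /* demo.c -- standalone illustration of the kh_float64_hash_func scheme. */
    #include <stdio.h>
    #include <stdint.h>
    #include <string.h>
    #include <inttypes.h>

    /* Same idea as the patch's asint64(), but via memcpy rather than a
       pointer cast, keeping this sketch free of aliasing concerns. */
    static uint64_t asint64_bits(double key)
    {
        uint64_t bits;
        memcpy(&bits, &key, sizeof bits);
        return bits;
    }

    /* Mirrors kh_float64_hash_func: (k >> 33) ^ k ^ (k << 11), truncated
       to 32 bits -- khash's default hash for 64-bit integers. */
    static uint32_t float64_hash(double key)
    {
        uint64_t k = asint64_bits(key);
        return (uint32_t)((k >> 33) ^ k ^ (k << 11));
    }

    int main(void)
    {
        /* Small fractional keys, like the multiples of 0.1 and of
           4.8e-08 built by the new benchmarks above. */
        for (int i = 1; i <= 5; i++) {
            double x = i * 0.1;
            printf("%.1f -> %08" PRIx32 "\n", x, float64_hash(x));
        }
        return 0;
    }

Because all 64 bits of the double feed the mix before the 32-bit truncation, such keys land in well-spread buckets rather than colliding the way the truncated Python 3 hash did, per the comment added in the second patch.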