smaller benches, explanatory comment

chris-b1 · chris-b1 · commit 3aec078d4867 · 2016-06-14T19:59:44.000-05:00
diff --git a/asv_bench/benchmarks/groupby.py b/asv_bench/benchmarks/groupby.py
@@ -515,7 +515,7 @@ def setup(self):
         tmp1 = (np.random.random(10000) * 0.1).astype(np.float32)
         tmp2 = (np.random.random(10000) * 10.0).astype(np.float32)
         tmp = np.concatenate((tmp1, tmp2))
-        arr = np.repeat(tmp, 100)
+        arr = np.repeat(tmp, 10)
         self.df = DataFrame(dict(a=arr, b=arr))
 
     def time_groupby_sum(self):
diff --git a/asv_bench/benchmarks/indexing.py b/asv_bench/benchmarks/indexing.py
@@ -493,7 +493,7 @@ class float_loc(object):
     goal_time = 0.2
 
     def setup(self):
-        a = np.arange(1000000)
+        a = np.arange(100000)
         self.ind = pd.Float64Index(a * 4.8000000418824129e-08)
 
     def time_float_loc(self):
diff --git a/pandas/src/klib/khash_python.h b/pandas/src/klib/khash_python.h
@@ -2,6 +2,17 @@
 
 #include "khash.h"
 
+// Previously we were using the built-in CPython hash function for doubles
+// python 2.7 https://github.com/python/cpython/blob/2.7/Objects/object.c#L1021
+// python 3.5 https://github.com/python/cpython/blob/3.5/Python/pyhash.c#L85
+
+// The python 3 hash function has the invariant hash(x) == hash(int(x)) == hash(decimal(x))
+// and the size of hash may be different by platform / version (long in py2, Py_ssize_t in py3).
+// We don't need those invariants because types will be cast before hashing, and if Py_ssize_t
+// is 64 bits the truncation causes collision issues.  Given all that, we use our own
+// simple hash, viewing the double bytes as an int64 and using khash's default
+// hash for 64 bit integers.
+// GH 13436
 inline khint64_t asint64(double key) {
   return *(khint64_t *)(&key);
 }