[PERF] using murmur2 hash for float64 klib-hash-tables

realead · realead · commit 4e89fce6cc05 · 2020-10-13T21:37:54.000+02:00
diff --git a/pandas/_libs/src/klib/khash.h b/pandas/_libs/src/klib/khash.h
@@ -171,6 +171,54 @@ khint32_t PANDAS_INLINE murmur2_32to32(khint32_t k){
     return h;
 }
 
+// Murmur2-style hash that mixes two 32-bit words (k1, then k2) into one 32-bit value.
+// A dedicated x64 version would need fewer operations, but always using the
+// 32-bit version also has some benefits:
+//    - one code path for 32-bit and 64-bit builds
+//    - the same behavior on 32-bit and 64-bit builds
+//    - no measurable performance difference compared to a possible x64 version
+khint32_t PANDAS_INLINE murmur2_32_32to32(khint32_t k1, khint32_t k2){
+    const khint32_t SEED = 0xc70f6907UL;
+    // 'm' and 'r' (M_32 and R_32 below) are mixing constants generated offline.
+    // They're not really 'magic', they just happen to work well.
+    const khint32_t M_32 = 0x5bd1e995;
+    const int R_32 = 24;
+
+    // Initialize the hash to a 'random' value. NOTE(review): reference MurmurHash2 xors the seed with the input length - confirm 4 is intended here.
+    khint32_t h = SEED ^ 4;
+
+    // Mix in the first 4 bytes (k1):
+    k1 *= M_32;
+    k1 ^= k1 >> R_32;
+    k1 *= M_32;
+
+    h *= M_32;
+    h ^= k1;
+
+    // Mix in the second 4 bytes (k2):
+    k2 *= M_32;
+    k2 ^= k2 >> R_32;
+    k2 *= M_32;
+
+    h *= M_32;
+    h ^= k2;
+
+    // Do a few final mixes of the hash to ensure the "last few
+    // bytes" are well-incorporated.
+    h ^= h >> 13;
+    h *= M_32;
+    h ^= h >> 15;
+    return h;
+}
+
+khint32_t PANDAS_INLINE murmur2_64to32(khint64_t k){
+    khint32_t k1 = (khint32_t)k;          // k truncated to khint32_t
+    khint32_t k2 = (khint32_t)(k >> 32);  // k shifted right by 32 bits, then truncated
+    // reduce the 64-bit key to a 32-bit hash by hashing its two halves together
+    return murmur2_32_32to32(k1, k2);
+}
+
+
 #ifdef KHASH_LINEAR
 #define __ac_inc(k, m) 1
 #else
diff --git a/pandas/_libs/src/klib/khash_python.h b/pandas/_libs/src/klib/khash_python.h
@@ -13,25 +13,31 @@
 // is 64 bits the truncation causes collission issues.  Given all that, we use our own
 // simple hash, viewing the double bytes as an int64 and using khash's default
 // hash for 64 bit integers.
-// GH 13436
+// GH 13436 showed that _Py_HashDouble doesn't work well with khash.
+// GH 28303 showed that the simple xor-based version isn't good enough,
+// thus a murmur2 hash is used.
+
 khint64_t PANDAS_INLINE asint64(double key) {
-  khint64_t val;
-  memcpy(&val, &key, sizeof(double));
-  return val;
+    khint64_t val;
+    memcpy(&val, &key, sizeof(double));  // copy the double's raw bytes into val (no numeric conversion)
+    return val;
 }
 
-// correct for all inputs but not -0.0 and NaNs
-#define kh_float64_hash_func_0_NAN(key) (khint32_t)((asint64(key))>>33^(asint64(key))^(asint64(key))<<11)
-
-// correct for all inputs but not NaNs
-#define kh_float64_hash_func_NAN(key) ((key) == 0.0 ?                       \
-                                        kh_float64_hash_func_0_NAN(0.0) : \
-                                        kh_float64_hash_func_0_NAN(key))
+#define ZERO_HASH 0  // hash returned by kh_float64_hash_func when val == 0.0 (covers -0.0 too)
+#define NAN_HASH  0  // hash returned by kh_float64_hash_func when val != val (any NaN)
 
-// correct for all
-#define kh_float64_hash_func(key) ((key) != (key) ?                       \
-                                   kh_float64_hash_func_NAN(Py_NAN) :     \
-                                   kh_float64_hash_func_NAN(key))
+khint32_t PANDAS_INLINE kh_float64_hash_func(double val){
+    // 0.0 == -0.0 under kh_float64_hash_equal, so both must have the same hash:
+    if (val == 0.0){
+        return ZERO_HASH;
+    }
+    // kh_float64_hash_equal treats all NaNs as equal, so they must have the same hash:
+    if ( val!=val ){
+        return NAN_HASH;  // same value as ZERO_HASH (both are defined as 0)
+    }
+    khint64_t as_int = asint64(val);  // raw bytes of the double viewed as an integer
+    return murmur2_64to32(as_int);    // every other value: murmur2 hash of those bytes
+}
 
 #define kh_float64_hash_equal(a, b) ((a) == (b) || ((b) != (b) && (a) != (a)))