Skip to content

Commit 4e89fce

Browse files
committed
[PERF] using murmur2 hash for float64 klib-hash-tables
1 parent 18cf44a commit 4e89fce

File tree

2 files changed

+69
-15
lines changed

2 files changed

+69
-15
lines changed

pandas/_libs/src/klib/khash.h

+48
Original file line numberDiff line numberDiff line change
@@ -171,6 +171,54 @@ khint32_t PANDAS_INLINE murmur2_32to32(khint32_t k){
171171
return h;
172172
}
173173

174+
// it is possible to have a special x64-version, which would need fewer operations, but
175+
// always using the 32bit version also has some benefits:
176+
// - one code for 32bit and 64bit builds
177+
// - the same hash values for 32bit and 64bit builds
178+
// - no performance difference could be measured compared to a possible x64-version
179+
180+
// Combine two 32-bit words into one 32-bit hash using murmur2-style mixing.
//   k1, k2:  the two words to hash (murmur2_64to32 passes the low and high
//            halves of a 64-bit key)
//   returns: the mixed 32-bit hash value
khint32_t PANDAS_INLINE murmur2_32_32to32(khint32_t k1, khint32_t k2){
    // Mixing constants (a multiplier and a shift width). They were generated
    // offline; there is nothing special about them beyond mixing well.
    const khint32_t M_32 = 0x5bd1e995;
    const int R_32 = 24;
    const khint32_t SEED = 0xc70f6907UL;

    // Starting state of the hash: the seed xored with 4.
    // NOTE(review): the reference MurmurHash2 xors the seed with the input
    // length in bytes; the constant here is 4 although two words are mixed in.
    // Presumably harmless (it only changes the effective seed) - confirm
    // before touching, since changing it changes every hash value.
    khint32_t hash = SEED ^ 4;

    // Scramble the first word, then fold it into the state.
    k1 *= M_32;
    k1 = (k1 ^ (k1 >> R_32)) * M_32;
    hash = (hash * M_32) ^ k1;

    // Same treatment for the second word.
    k2 *= M_32;
    k2 = (k2 ^ (k2 >> R_32)) * M_32;
    hash = (hash * M_32) ^ k2;

    // Final mixing rounds so the bits of the last word folded in are spread
    // across the whole result.
    hash = (hash ^ (hash >> 13)) * M_32;
    return hash ^ (hash >> 15);
}
213+
214+
// Hash a 64-bit key down to 32 bits.
// The key is split into its low and high 32-bit halves, which are then mixed
// together by murmur2_32_32to32 (low half first).
khint32_t PANDAS_INLINE murmur2_64to32(khint64_t k){
    return murmur2_32_32to32((khint32_t)k, (khint32_t)(k >> 32));
}
220+
221+
174222
#ifdef KHASH_LINEAR
175223
#define __ac_inc(k, m) 1
176224
#else

pandas/_libs/src/klib/khash_python.h

+21-15
Original file line numberDiff line numberDiff line change
@@ -13,25 +13,31 @@
1313
// is 64 bits the truncation causes collision issues. Given all that, we use our own
1414
// simple hash, viewing the double bytes as an int64 and using khash's default
1515
// hash for 64 bit integers.
16-
// GH 13436
16+
// GH 13436 showed that _Py_HashDouble doesn't work well with khash
17+
// GH 28303 showed that the simple xoring-version isn't good enough
18+
// thus murmur2-hash is used
19+
1720
// Return the bit pattern of a double as a 64-bit integer.
//   key:     the double whose bytes are reinterpreted
//   returns: a khint64_t holding the bytes copied from `key`
// The bytes are copied with memcpy instead of casting a pointer, so the
// object is never accessed through an incompatible pointer type.
khint64_t PANDAS_INLINE asint64(double key) {
    khint64_t val;
    memcpy(&val, &key, sizeof(double));
    return val;
}
2225

23-
// correct for all inputs but not -0.0 and NaNs
24-
// Previous xor/shift hash for doubles (shown as a removed line in this diff).
// Views the double as a 64-bit integer via asint64 and xors together the value
// shifted right by 33, the value itself, and the value shifted left by 11;
// the result is then truncated to khint32_t.
// No special handling of -0.0 or NaN happens here. The argument `key` is
// expanded three times, so it must not have side effects.
#define kh_float64_hash_func_0_NAN(key) (khint32_t)((asint64(key))>>33^(asint64(key))^(asint64(key))<<11)
25-
26-
// correct for all inputs but not NaNs
27-
// Previous zero-normalising wrapper (shown as removed lines in this diff).
// Any key that compares equal to 0.0 - which includes -0.0 - is hashed as the
// literal 0.0; every other key is passed to kh_float64_hash_func_0_NAN as is.
// NaN keys are not normalised at this level.
#define kh_float64_hash_func_NAN(key) ((key) == 0.0 ? \
28-
kh_float64_hash_func_0_NAN(0.0) : \
29-
kh_float64_hash_func_0_NAN(key))
26+
// Fixed hash value that kh_float64_hash_func returns for every key comparing
// equal to 0.0, so that 0.0 and -0.0 share one hash.
#define ZERO_HASH 0
27+
// Fixed hash value that kh_float64_hash_func returns for every NaN key
// (val != val), so that all NaN bit patterns share one hash.
#define NAN_HASH 0
3028

31-
// correct for all
32-
// Previous macro form of the float64 hash (shown as removed lines in this diff).
// A key that is NaN (it compares unequal to itself) is hashed as Py_NAN so that
// all NaNs get the same hash; every other key is passed on to
// kh_float64_hash_func_NAN.
#define kh_float64_hash_func(key) ((key) != (key) ? \
33-
kh_float64_hash_func_NAN(Py_NAN) : \
34-
kh_float64_hash_func_NAN(key))
29+
// Hash a double for the float64 hash tables.
//   val:     the key to hash
//   returns: NAN_HASH for any NaN, ZERO_HASH for 0.0 and -0.0, otherwise the
//            murmur2 hash of the key's 64-bit pattern
// Keys that compare equal under kh_float64_hash_equal (the two zeros, or any
// two NaNs) therefore always receive the same hash.
khint32_t PANDAS_INLINE kh_float64_hash_func(double val){
    // NaN is the only value that compares unequal to itself.
    if (val != val) {
        return NAN_HASH;
    }
    // -0.0 == 0.0 holds, so both signed zeros take this branch.
    if (val == 0.0) {
        return ZERO_HASH;
    }
    return murmur2_64to32(asint64(val));
}
3541

3642
// Key equality for float64 tables: true when a == b, or when both a and b are
// NaN (each compares unequal to itself), so NaN keys match each other.
// Both arguments can be expanded more than once; avoid side effects in them.
#define kh_float64_hash_equal(a, b) ((a) == (b) || ((b) != (b) && (a) != (a)))
3743

0 commit comments

Comments
 (0)