From 9200cd80c27c25304313bc98e1167d4b1e398774 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Tue, 22 Nov 2022 19:36:21 -0800 Subject: [PATCH] Revert "Vendored klib quadatric probing (#49197)" This reverts commit 2317bf08ed172048a66a0533645ebc1886939417. --- pandas/_libs/src/klib/khash.h | 37 +++++++++++++++++------------------ 1 file changed, 18 insertions(+), 19 deletions(-) diff --git a/pandas/_libs/src/klib/khash.h b/pandas/_libs/src/klib/khash.h index 6bc560c5ce576..e17d82d51f0fb 100644 --- a/pandas/_libs/src/klib/khash.h +++ b/pandas/_libs/src/klib/khash.h @@ -47,16 +47,6 @@ int main() { */ /* - 2013-05-02 (0.2.8): - * Use quadratic probing. When the capacity is power of 2, stepping function - i*(i+1)/2 guarantees to traverse each bucket. It is better than double - hashing on cache performance and is more robust than linear probing. - In theory, double hashing should be more robust than quadratic probing. - However, my implementation is probably not for large hash tables, because - the second hash function is closely tied to the first hash function, - which reduce the effectiveness of double hashing. - Reference: http://research.cs.vt.edu/AVresearch/hashing/quadratic.php - 2011-09-16 (0.2.6): * The capacity is a power of 2. This seems to dramatically improve the @@ -117,7 +107,7 @@ int main() { Generic hash table library. */ -#define AC_VERSION_KHASH_H "0.2.8" +#define AC_VERSION_KHASH_H "0.2.6" #include #include @@ -187,6 +177,7 @@ typedef khuint_t khiter_t; #define __ac_set_isboth_false(flag, i) __ac_set_isempty_false(flag, i) #define __ac_set_isdel_true(flag, i) ((void)0) + // specializations of https://github.com/aappleby/smhasher/blob/master/src/MurmurHash2.cpp khuint32_t PANDAS_INLINE murmur2_32to32(khuint32_t k){ const khuint32_t SEED = 0xc70f6907UL; @@ -261,6 +252,13 @@ khuint32_t PANDAS_INLINE murmur2_64to32(khuint64_t k){ return murmur2_32_32to32(k1, k2); } + +#ifdef KHASH_LINEAR +#define __ac_inc(k, m) 1 +#else +#define __ac_inc(k, m) (murmur2_32to32(k) | 1) & (m) +#endif + #define __ac_fsize(m) ((m) < 32? 1 : (m)>>5) #ifndef kroundup32 @@ -312,12 +310,12 @@ static const double __ac_HASH_UPPER = 0.77; SCOPE khuint_t kh_get_##name(const kh_##name##_t *h, khkey_t key) \ { \ if (h->n_buckets) { \ - khuint_t k, i, last, mask, step=0;\ + khuint_t inc, k, i, last, mask; \ mask = h->n_buckets - 1; \ k = __hash_func(key); i = k & mask; \ - last = i; \ + inc = __ac_inc(k, mask); last = i; /* inc==1 for linear probing */ \ while (!__ac_isempty(h->flags, i) && (__ac_isdel(h->flags, i) || !__hash_equal(h->keys[i], key))) { \ - i = (i + ++step) & mask; \ + i = (i + inc) & mask; \ if (i == last) return h->n_buckets; \ } \ return __ac_iseither(h->flags, i)? h->n_buckets : i; \ @@ -350,10 +348,11 @@ static const double __ac_HASH_UPPER = 0.77; if (kh_is_map) val = h->vals[j]; \ __ac_set_isempty_true(h->flags, j); \ while (1) { /* kick-out process; sort of like in Cuckoo hashing */ \ - khuint_t k, i, step=0;\ + khuint_t inc, k, i; \ k = __hash_func(key); \ i = k & new_mask; \ - while (!__ac_isempty(new_flags, i)) i = (i + (++step)) & new_mask; \ + inc = __ac_inc(k, new_mask); \ + while (!__ac_isempty(new_flags, i)) i = (i + inc) & new_mask; \ __ac_set_isempty_false(new_flags, i); \ if (i < h->n_buckets && __ac_iseither(h->flags, i) == 0) { /* kick out the existing element */ \ { khkey_t tmp = h->keys[i]; h->keys[i] = key; key = tmp; } \ @@ -386,14 +385,14 @@ static const double __ac_HASH_UPPER = 0.77; else kh_resize_##name(h, h->n_buckets + 1); /* expand the hash table */ \ } /* TODO: to implement automatically shrinking; resize() already support shrinking */ \ { \ - khuint_t k, i, site, last, mask = h->n_buckets - 1, step=0;\ + khuint_t inc, k, i, site, last, mask = h->n_buckets - 1; \ x = site = h->n_buckets; k = __hash_func(key); i = k & mask; \ if (__ac_isempty(h->flags, i)) x = i; /* for speed up */ \ else { \ - last = i ; \ + inc = __ac_inc(k, mask); last = i; \ while (!__ac_isempty(h->flags, i) && (__ac_isdel(h->flags, i) || !__hash_equal(h->keys[i], key))) { \ if (__ac_isdel(h->flags, i)) site = i; \ - i = (i + (++step)) & mask; \ + i = (i + inc) & mask; \ if (i == last) { x = site; break; } \ } \ if (x == h->n_buckets) { \