From f60d0c3b3ae061df95a5b2a1b5eb3dafb6923a37 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Wed, 19 Oct 2022 21:17:01 -0700 Subject: [PATCH] Vendored klib quadtric probing --- pandas/_libs/src/klib/khash.h | 37 ++++++++++++++++++----------------- 1 file changed, 19 insertions(+), 18 deletions(-) diff --git a/pandas/_libs/src/klib/khash.h b/pandas/_libs/src/klib/khash.h index e17d82d51f0fb..6bc560c5ce576 100644 --- a/pandas/_libs/src/klib/khash.h +++ b/pandas/_libs/src/klib/khash.h @@ -47,6 +47,16 @@ int main() { */ /* + 2013-05-02 (0.2.8): + * Use quadratic probing. When the capacity is power of 2, stepping function + i*(i+1)/2 guarantees to traverse each bucket. It is better than double + hashing on cache performance and is more robust than linear probing. + In theory, double hashing should be more robust than quadratic probing. + However, my implementation is probably not for large hash tables, because + the second hash function is closely tied to the first hash function, + which reduce the effectiveness of double hashing. + Reference: http://research.cs.vt.edu/AVresearch/hashing/quadratic.php + 2011-09-16 (0.2.6): * The capacity is a power of 2. This seems to dramatically improve the @@ -107,7 +117,7 @@ int main() { Generic hash table library. */ -#define AC_VERSION_KHASH_H "0.2.6" +#define AC_VERSION_KHASH_H "0.2.8" #include #include @@ -177,7 +187,6 @@ typedef khuint_t khiter_t; #define __ac_set_isboth_false(flag, i) __ac_set_isempty_false(flag, i) #define __ac_set_isdel_true(flag, i) ((void)0) - // specializations of https://github.com/aappleby/smhasher/blob/master/src/MurmurHash2.cpp khuint32_t PANDAS_INLINE murmur2_32to32(khuint32_t k){ const khuint32_t SEED = 0xc70f6907UL; @@ -252,13 +261,6 @@ khuint32_t PANDAS_INLINE murmur2_64to32(khuint64_t k){ return murmur2_32_32to32(k1, k2); } - -#ifdef KHASH_LINEAR -#define __ac_inc(k, m) 1 -#else -#define __ac_inc(k, m) (murmur2_32to32(k) | 1) & (m) -#endif - #define __ac_fsize(m) ((m) < 32? 1 : (m)>>5) #ifndef kroundup32 @@ -310,12 +312,12 @@ static const double __ac_HASH_UPPER = 0.77; SCOPE khuint_t kh_get_##name(const kh_##name##_t *h, khkey_t key) \ { \ if (h->n_buckets) { \ - khuint_t inc, k, i, last, mask; \ + khuint_t k, i, last, mask, step=0;\ mask = h->n_buckets - 1; \ k = __hash_func(key); i = k & mask; \ - inc = __ac_inc(k, mask); last = i; /* inc==1 for linear probing */ \ + last = i; \ while (!__ac_isempty(h->flags, i) && (__ac_isdel(h->flags, i) || !__hash_equal(h->keys[i], key))) { \ - i = (i + inc) & mask; \ + i = (i + ++step) & mask; \ if (i == last) return h->n_buckets; \ } \ return __ac_iseither(h->flags, i)? h->n_buckets : i; \ @@ -348,11 +350,10 @@ static const double __ac_HASH_UPPER = 0.77; if (kh_is_map) val = h->vals[j]; \ __ac_set_isempty_true(h->flags, j); \ while (1) { /* kick-out process; sort of like in Cuckoo hashing */ \ - khuint_t inc, k, i; \ + khuint_t k, i, step=0;\ k = __hash_func(key); \ i = k & new_mask; \ - inc = __ac_inc(k, new_mask); \ - while (!__ac_isempty(new_flags, i)) i = (i + inc) & new_mask; \ + while (!__ac_isempty(new_flags, i)) i = (i + (++step)) & new_mask; \ __ac_set_isempty_false(new_flags, i); \ if (i < h->n_buckets && __ac_iseither(h->flags, i) == 0) { /* kick out the existing element */ \ { khkey_t tmp = h->keys[i]; h->keys[i] = key; key = tmp; } \ @@ -385,14 +386,14 @@ static const double __ac_HASH_UPPER = 0.77; else kh_resize_##name(h, h->n_buckets + 1); /* expand the hash table */ \ } /* TODO: to implement automatically shrinking; resize() already support shrinking */ \ { \ - khuint_t inc, k, i, site, last, mask = h->n_buckets - 1; \ + khuint_t k, i, site, last, mask = h->n_buckets - 1, step=0;\ x = site = h->n_buckets; k = __hash_func(key); i = k & mask; \ if (__ac_isempty(h->flags, i)) x = i; /* for speed up */ \ else { \ - inc = __ac_inc(k, mask); last = i; \ + last = i ; \ while (!__ac_isempty(h->flags, i) && (__ac_isdel(h->flags, i) || !__hash_equal(h->keys[i], key))) { \ if (__ac_isdel(h->flags, i)) site = i; \ - i = (i + inc) & mask; \ + i = (i + (++step)) & mask; \ if (i == last) { x = site; break; } \ } \ if (x == h->n_buckets) { \