Skip to content

Vendored klib quadratic probing #49197

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Oct 26, 2022
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
37 changes: 19 additions & 18 deletions pandas/_libs/src/klib/khash.h
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,16 @@ int main() {
*/

/*
2013-05-02 (0.2.8):
* Use quadratic probing. When the capacity is a power of 2, the stepping
function i*(i+1)/2 is guaranteed to traverse each bucket. It is better than
double hashing on cache performance and is more robust than linear probing.
In theory, double hashing should be more robust than quadratic probing.
However, my implementation is probably not suited to large hash tables,
because the second hash function is closely tied to the first hash function,
which reduces the effectiveness of double hashing.
Reference: http://research.cs.vt.edu/AVresearch/hashing/quadratic.php

2011-09-16 (0.2.6):

* The capacity is a power of 2. This seems to dramatically improve the
Expand Down Expand Up @@ -107,7 +117,7 @@ int main() {
Generic hash table library.
*/

#define AC_VERSION_KHASH_H "0.2.6"
#define AC_VERSION_KHASH_H "0.2.8"

#include <stdlib.h>
#include <string.h>
Expand Down Expand Up @@ -177,7 +187,6 @@ typedef khuint_t khiter_t;
#define __ac_set_isboth_false(flag, i) __ac_set_isempty_false(flag, i)
#define __ac_set_isdel_true(flag, i) ((void)0)


// specializations of https://github.com/aappleby/smhasher/blob/master/src/MurmurHash2.cpp
khuint32_t PANDAS_INLINE murmur2_32to32(khuint32_t k){
const khuint32_t SEED = 0xc70f6907UL;
Expand Down Expand Up @@ -252,13 +261,6 @@ khuint32_t PANDAS_INLINE murmur2_64to32(khuint64_t k){
return murmur2_32_32to32(k1, k2);
}


#ifdef KHASH_LINEAR
#define __ac_inc(k, m) 1
#else
#define __ac_inc(k, m) (murmur2_32to32(k) | 1) & (m)
#endif

#define __ac_fsize(m) ((m) < 32? 1 : (m)>>5)

#ifndef kroundup32
Expand Down Expand Up @@ -310,12 +312,12 @@ static const double __ac_HASH_UPPER = 0.77;
SCOPE khuint_t kh_get_##name(const kh_##name##_t *h, khkey_t key) \
{ \
if (h->n_buckets) { \
khuint_t inc, k, i, last, mask; \
khuint_t k, i, last, mask, step=0;\
mask = h->n_buckets - 1; \
k = __hash_func(key); i = k & mask; \
inc = __ac_inc(k, mask); last = i; /* inc==1 for linear probing */ \
last = i; \
while (!__ac_isempty(h->flags, i) && (__ac_isdel(h->flags, i) || !__hash_equal(h->keys[i], key))) { \
i = (i + inc) & mask; \
i = (i + ++step) & mask; \
if (i == last) return h->n_buckets; \
} \
return __ac_iseither(h->flags, i)? h->n_buckets : i; \
Expand Down Expand Up @@ -348,11 +350,10 @@ static const double __ac_HASH_UPPER = 0.77;
if (kh_is_map) val = h->vals[j]; \
__ac_set_isempty_true(h->flags, j); \
while (1) { /* kick-out process; sort of like in Cuckoo hashing */ \
khuint_t inc, k, i; \
khuint_t k, i, step=0;\
k = __hash_func(key); \
i = k & new_mask; \
inc = __ac_inc(k, new_mask); \
while (!__ac_isempty(new_flags, i)) i = (i + inc) & new_mask; \
while (!__ac_isempty(new_flags, i)) i = (i + (++step)) & new_mask; \
__ac_set_isempty_false(new_flags, i); \
if (i < h->n_buckets && __ac_iseither(h->flags, i) == 0) { /* kick out the existing element */ \
{ khkey_t tmp = h->keys[i]; h->keys[i] = key; key = tmp; } \
Expand Down Expand Up @@ -385,14 +386,14 @@ static const double __ac_HASH_UPPER = 0.77;
else kh_resize_##name(h, h->n_buckets + 1); /* expand the hash table */ \
} /* TODO: to implement automatically shrinking; resize() already support shrinking */ \
{ \
khuint_t inc, k, i, site, last, mask = h->n_buckets - 1; \
khuint_t k, i, site, last, mask = h->n_buckets - 1, step=0;\
x = site = h->n_buckets; k = __hash_func(key); i = k & mask; \
if (__ac_isempty(h->flags, i)) x = i; /* for speed up */ \
else { \
inc = __ac_inc(k, mask); last = i; \
last = i ; \
while (!__ac_isempty(h->flags, i) && (__ac_isdel(h->flags, i) || !__hash_equal(h->keys[i], key))) { \
if (__ac_isdel(h->flags, i)) site = i; \
i = (i + inc) & mask; \
i = (i + (++step)) & mask; \
if (i == last) { x = site; break; } \
} \
if (x == h->n_buckets) { \
Expand Down