Skip to content

Commit 5e44098

Browse files
committed
Introduce a combined probing strategy: take up to LINEAR_STEPS linear probing steps first, then switch to the second-hash step size
1 parent c852b2e commit 5e44098

File tree

2 files changed

+44
-22
lines changed

2 files changed

+44
-22
lines changed

pandas/_libs/src/klib/khash.h

+41-19
Original file line number | Diff line number | Diff line change
@@ -218,9 +218,10 @@ khint32_t PANDAS_INLINE murmur2_64to32(khint64_t k){
218218
return murmur2_32_32to32(k1, k2);
219219
}
220220

221+
#define __ac_inc_linear(k, m) 1
221222

222223
#ifdef KHASH_LINEAR
223-
#define __ac_inc(k, m) 1
224+
#define __ac_inc(k, m) __ac_inc_linear(k, m)
224225
#else
225226
#define __ac_inc(k, m) (murmur2_32to32(k) | 1) & (m)
226227
#endif
@@ -248,7 +249,7 @@ static const double __ac_HASH_UPPER = 0.77;
248249
extern khint_t kh_put_##name(kh_##name##_t *h, khkey_t key, int *ret); \
249250
extern void kh_del_##name(kh_##name##_t *h, khint_t x);
250251

251-
#define KHASH_INIT2(name, SCOPE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \
252+
#define KHASH_INIT2(name, SCOPE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal, LINEAR_STEPS) \
252253
typedef struct { \
253254
khint_t n_buckets, size, n_occupied, upper_bound; \
254255
khint32_t *flags; \
@@ -276,13 +277,18 @@ static const double __ac_HASH_UPPER = 0.77;
276277
SCOPE khint_t kh_get_##name(const kh_##name##_t *h, khkey_t key) \
277278
{ \
278279
if (h->n_buckets) { \
279-
khint_t inc, k, i, last, mask; \
280+
khint_t inc, k, i, last, mask, cnt; \
280281
mask = h->n_buckets - 1; \
281282
k = __hash_func(key); i = k & mask; \
282-
inc = __ac_inc(k, mask); last = i; /* inc==1 for linear probing */ \
283+
cnt=0; \
284+
inc = __ac_inc_linear(k, mask); last = i; \
283285
while (!__ac_isempty(h->flags, i) && (__ac_isdel(h->flags, i) || !__hash_equal(h->keys[i], key))) { \
286+
if (cnt == LINEAR_STEPS){ \
287+
inc = __ac_inc(k, mask); last = i; \
288+
} \
284289
i = (i + inc) & mask; \
285290
if (i == last) return h->n_buckets; \
291+
cnt++; \
286292
} \
287293
return __ac_iseither(h->flags, i)? h->n_buckets : i; \
288294
} else return 0; \
@@ -314,11 +320,18 @@ static const double __ac_HASH_UPPER = 0.77;
314320
if (kh_is_map) val = h->vals[j]; \
315321
__ac_set_isempty_true(h->flags, j); \
316322
while (1) { /* kick-out process; sort of like in Cuckoo hashing */ \
317-
khint_t inc, k, i; \
323+
khint_t inc, k, i, cnt; \
318324
k = __hash_func(key); \
319325
i = k & new_mask; \
320-
inc = __ac_inc(k, new_mask); \
321-
while (!__ac_isempty(new_flags, i)) i = (i + inc) & new_mask; \
326+
cnt=0; \
327+
inc = __ac_inc_linear(k, new_mask); \
328+
while (!__ac_isempty(new_flags, i)){ \
329+
if (cnt == LINEAR_STEPS){ \
330+
inc = __ac_inc(k, new_mask); \
331+
} \
332+
i = (i + inc) & new_mask; \
333+
cnt++; \
334+
} \
322335
__ac_set_isempty_false(new_flags, i); \
323336
if (i < h->n_buckets && __ac_iseither(h->flags, i) == 0) { /* kick out the existing element */ \
324337
{ khkey_t tmp = h->keys[i]; h->keys[i] = key; key = tmp; } \
@@ -351,14 +364,19 @@ static const double __ac_HASH_UPPER = 0.77;
351364
else kh_resize_##name(h, h->n_buckets + 1); /* expand the hash table */ \
352365
} /* TODO: to implement automatically shrinking; resize() already support shrinking */ \
353366
{ \
354-
khint_t inc, k, i, site, last, mask = h->n_buckets - 1; \
367+
khint_t inc, k, i, site, last, cnt, mask = h->n_buckets - 1; \
355368
x = site = h->n_buckets; k = __hash_func(key); i = k & mask; \
356369
if (__ac_isempty(h->flags, i)) x = i; /* for speed up */ \
357370
else { \
358-
inc = __ac_inc(k, mask); last = i; \
371+
cnt = 0; \
372+
inc = __ac_inc_linear(k, mask); last = i; \
359373
while (!__ac_isempty(h->flags, i) && (__ac_isdel(h->flags, i) || !__hash_equal(h->keys[i], key))) { \
374+
if (cnt == LINEAR_STEPS){ \
375+
inc = __ac_inc(k, mask); last = i; \
376+
} \
360377
if (__ac_isdel(h->flags, i)) site = i; \
361378
i = (i + inc) & mask; \
379+
cnt++; \
362380
if (i == last) { x = site; break; } \
363381
} \
364382
if (x == h->n_buckets) { \
@@ -388,8 +406,8 @@ static const double __ac_HASH_UPPER = 0.77;
388406
} \
389407
}
390408

391-
#define KHASH_INIT(name, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \
392-
KHASH_INIT2(name, PANDAS_INLINE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal)
409+
#define KHASH_INIT(name, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal, LINEAR_STEPS) \
410+
KHASH_INIT2(name, PANDAS_INLINE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal, LINEAR_STEPS)
393411

394412
/* --- BEGIN OF HASH FUNCTIONS --- */
395413

@@ -576,41 +594,45 @@ PANDAS_INLINE khint_t __ac_Wang_hash(khint_t key)
576594

577595
/* More convenient interfaces */
578596

597+
#define LINEAR_STEPS_BIG_OBJECT 0
598+
#define LINEAR_STEPS_32BIT 3
599+
#define LINEAR_STEPS_64BIT 3
600+
579601
/*! @function
580602
@abstract Instantiate a hash set containing integer keys
581603
@param name Name of the hash table [symbol]
582604
*/
583605
#define KHASH_SET_INIT_INT(name) \
584-
KHASH_INIT(name, khint32_t, char, 0, kh_int_hash_func, kh_int_hash_equal)
606+
KHASH_INIT(name, khint32_t, char, 0, kh_int_hash_func, kh_int_hash_equal, LINEAR_STEPS_32BIT)
585607

586608
/*! @function
587609
@abstract Instantiate a hash map containing integer keys
588610
@param name Name of the hash table [symbol]
589611
@param khval_t Type of values [type]
590612
*/
591613
#define KHASH_MAP_INIT_INT(name, khval_t) \
592-
KHASH_INIT(name, khint32_t, khval_t, 1, kh_int_hash_func, kh_int_hash_equal)
614+
KHASH_INIT(name, khint32_t, khval_t, 1, kh_int_hash_func, kh_int_hash_equal, LINEAR_STEPS_32BIT)
593615

594616
/*! @function
595617
@abstract Instantiate a hash set containing 64-bit integer keys
596618
@param name Name of the hash table [symbol]
597619
*/
598620
#define KHASH_SET_INIT_UINT64(name) \
599-
KHASH_INIT(name, khuint64_t, char, 0, kh_int64_hash_func, kh_int64_hash_equal)
621+
KHASH_INIT(name, khuint64_t, char, 0, kh_int64_hash_func, kh_int64_hash_equal, LINEAR_STEPS_64BIT)
600622

601623
#define KHASH_SET_INIT_INT64(name) \
602-
KHASH_INIT(name, khint64_t, char, 0, kh_int64_hash_func, kh_int64_hash_equal)
624+
KHASH_INIT(name, khint64_t, char, 0, kh_int64_hash_func, kh_int64_hash_equal, LINEAR_STEPS_64BIT)
603625

604626
/*! @function
605627
@abstract Instantiate a hash map containing 64-bit integer keys
606628
@param name Name of the hash table [symbol]
607629
@param khval_t Type of values [type]
608630
*/
609631
#define KHASH_MAP_INIT_UINT64(name, khval_t) \
610-
KHASH_INIT(name, khuint64_t, khval_t, 1, kh_int64_hash_func, kh_int64_hash_equal)
632+
KHASH_INIT(name, khuint64_t, khval_t, 1, kh_int64_hash_func, kh_int64_hash_equal, LINEAR_STEPS_64BIT)
611633

612634
#define KHASH_MAP_INIT_INT64(name, khval_t) \
613-
KHASH_INIT(name, khint64_t, khval_t, 1, kh_int64_hash_func, kh_int64_hash_equal)
635+
KHASH_INIT(name, khint64_t, khval_t, 1, kh_int64_hash_func, kh_int64_hash_equal, LINEAR_STEPS_64BIT)
614636

615637

616638
typedef const char *kh_cstr_t;
@@ -619,15 +641,15 @@ typedef const char *kh_cstr_t;
619641
@param name Name of the hash table [symbol]
620642
*/
621643
#define KHASH_SET_INIT_STR(name) \
622-
KHASH_INIT(name, kh_cstr_t, char, 0, kh_str_hash_func, kh_str_hash_equal)
644+
KHASH_INIT(name, kh_cstr_t, char, 0, kh_str_hash_func, kh_str_hash_equal, LINEAR_STEPS_BIG_OBJECT)
623645

624646
/*! @function
625647
@abstract Instantiate a hash map containing const char* keys
626648
@param name Name of the hash table [symbol]
627649
@param khval_t Type of values [type]
628650
*/
629651
#define KHASH_MAP_INIT_STR(name, khval_t) \
630-
KHASH_INIT(name, kh_cstr_t, khval_t, 1, kh_str_hash_func, kh_str_hash_equal)
652+
KHASH_INIT(name, kh_cstr_t, khval_t, 1, kh_str_hash_func, kh_str_hash_equal, LINEAR_STEPS_BIG_OBJECT)
631653

632654

633655
#define kh_exist_str(h, k) (kh_exist(h, k))

pandas/_libs/src/klib/khash_python.h

+3-3
Original file line number | Diff line number | Diff line change
@@ -42,7 +42,7 @@ khint32_t PANDAS_INLINE kh_float64_hash_func(double val){
4242
#define kh_float64_hash_equal(a, b) ((a) == (b) || ((b) != (b) && (a) != (a)))
4343

4444
#define KHASH_MAP_INIT_FLOAT64(name, khval_t) \
45-
KHASH_INIT(name, khfloat64_t, khval_t, 1, kh_float64_hash_func, kh_float64_hash_equal)
45+
KHASH_INIT(name, khfloat64_t, khval_t, 1, kh_float64_hash_func, kh_float64_hash_equal, LINEAR_STEPS_64BIT)
4646

4747
KHASH_MAP_INIT_FLOAT64(float64, size_t)
4848

@@ -76,13 +76,13 @@ typedef PyObject* kh_pyobject_t;
7676

7777
#define KHASH_MAP_INIT_PYOBJECT(name, khval_t) \
7878
KHASH_INIT(name, kh_pyobject_t, khval_t, 1, \
79-
kh_python_hash_func, kh_python_hash_equal)
79+
kh_python_hash_func, kh_python_hash_equal, LINEAR_STEPS_BIG_OBJECT)
8080

8181
KHASH_MAP_INIT_PYOBJECT(pymap, Py_ssize_t)
8282

8383
#define KHASH_SET_INIT_PYOBJECT(name) \
8484
KHASH_INIT(name, kh_pyobject_t, char, 0, \
85-
kh_python_hash_func, kh_python_hash_equal)
85+
kh_python_hash_func, kh_python_hash_equal, LINEAR_STEPS_BIG_OBJECT)
8686

8787
KHASH_SET_INIT_PYOBJECT(pyset)
8888

0 commit comments

Comments
 (0)