Skip to content

Commit 596ca32

Browse files
committed
ENH: add AC's khash.h to start tinkering with
1 parent a57bd48 commit 596ca32

File tree

2 files changed

+327
-0
lines changed

2 files changed

+327
-0
lines changed

pandas/src/khash.h

Lines changed: 326 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,326 @@
1+
/* The MIT License
2+
3+
Copyright (c) 2008, by Attractive Chaos <[email protected]>
4+
5+
Permission is hereby granted, free of charge, to any person obtaining
6+
a copy of this software and associated documentation files (the
7+
"Software"), to deal in the Software without restriction, including
8+
without limitation the rights to use, copy, modify, merge, publish,
9+
distribute, sublicense, and/or sell copies of the Software, and to
10+
permit persons to whom the Software is furnished to do so, subject to
11+
the following conditions:
12+
13+
The above copyright notice and this permission notice shall be
14+
included in all copies or substantial portions of the Software.
15+
16+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
20+
BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
21+
ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
22+
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
23+
SOFTWARE.
24+
*/
25+
26+
/*
27+
An example:
28+
29+
#include "khash.h"
30+
KHASH_MAP_INIT_INT(32, char)
31+
int main() {
32+
int ret, is_missing;
33+
khiter_t k;
34+
khash_t(32) *h = kh_init(32);
35+
k = kh_put(32, h, 5, &ret);
36+
if (!ret) kh_del(32, h, k);
37+
kh_value(h, k) = 10;
38+
k = kh_get(32, h, 10);
39+
is_missing = (k == kh_end(h));
40+
k = kh_get(32, h, 5);
41+
kh_del(32, h, k);
42+
for (k = kh_begin(h); k != kh_end(h); ++k)
43+
if (kh_exist(h, k)) kh_value(h, k) = 1;
44+
kh_destroy(32, h);
45+
return 0;
46+
}
47+
*/
48+
49+
/*
50+
2008-09-19 (0.2.3):
51+
52+
* Corrected the example
53+
* Improved interfaces
54+
55+
2008-09-11 (0.2.2):
56+
57+
* Improved speed a little in kh_put()
58+
59+
2008-09-10 (0.2.1):
60+
61+
* Added kh_clear()
62+
* Fixed a compiling error
63+
64+
2008-09-02 (0.2.0):
65+
66+
* Changed to token concatenation which increases flexibility.
67+
68+
2008-08-31 (0.1.2):
69+
70+
* Fixed a bug in kh_get(), which has not been tested previously.
71+
72+
2008-08-31 (0.1.1):
73+
74+
* Added destructor
75+
*/
76+
77+
78+
#ifndef __AC_KHASH_H
79+
#define __AC_KHASH_H
80+
81+
#define AC_VERSION_KHASH_H "0.2.2"
82+
83+
#include <stdint.h>
84+
#include <stdlib.h>
85+
#include <string.h>
86+
87+
typedef uint32_t khint_t;
88+
typedef khint_t khiter_t;
89+
90+
#define __ac_HASH_PRIME_SIZE 32
91+
static const uint32_t __ac_prime_list[__ac_HASH_PRIME_SIZE] =
92+
{
93+
0ul, 3ul, 11ul, 23ul, 53ul,
94+
97ul, 193ul, 389ul, 769ul, 1543ul,
95+
3079ul, 6151ul, 12289ul, 24593ul, 49157ul,
96+
98317ul, 196613ul, 393241ul, 786433ul, 1572869ul,
97+
3145739ul, 6291469ul, 12582917ul, 25165843ul, 50331653ul,
98+
100663319ul, 201326611ul, 402653189ul, 805306457ul, 1610612741ul,
99+
3221225473ul, 4294967291ul
100+
};
101+
102+
#define __ac_isempty(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&2)
103+
#define __ac_isdel(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&1)
104+
#define __ac_iseither(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&3)
105+
#define __ac_set_isdel_false(flag, i) (flag[i>>4]&=~(1ul<<((i&0xfU)<<1)))
106+
#define __ac_set_isempty_false(flag, i) (flag[i>>4]&=~(2ul<<((i&0xfU)<<1)))
107+
#define __ac_set_isboth_false(flag, i) (flag[i>>4]&=~(3ul<<((i&0xfU)<<1)))
108+
#define __ac_set_isdel_true(flag, i) (flag[i>>4]|=1ul<<((i&0xfU)<<1))
109+
110+
static const double __ac_HASH_UPPER = 0.77;
111+
112+
#define KHASH_INIT(name, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \
113+
typedef struct { \
114+
khint_t n_buckets, size, n_occupied, upper_bound; \
115+
uint32_t *flags; \
116+
khkey_t *keys; \
117+
khval_t *vals; \
118+
} kh_##name##_t; \
119+
static inline kh_##name##_t *kh_init_##name() { \
120+
return (kh_##name##_t*)calloc(1, sizeof(kh_##name##_t)); \
121+
} \
122+
static inline void kh_destroy_##name(kh_##name##_t *h) \
123+
{ \
124+
if (h) { \
125+
free(h->keys); free(h->flags); \
126+
free(h->vals); \
127+
free(h); \
128+
} \
129+
} \
130+
static inline void kh_clear_##name(kh_##name##_t *h) \
131+
{ \
132+
if (h && h->flags) { \
133+
memset(h->flags, 0xaa, ((h->n_buckets>>4) + 1) * sizeof(uint32_t)); \
134+
h->size = h->n_occupied = 0; \
135+
} \
136+
} \
137+
static inline khint_t kh_get_##name(kh_##name##_t *h, khkey_t key) \
138+
{ \
139+
if (h->n_buckets) { \
140+
khint_t inc, k, i, last; \
141+
k = __hash_func(key); i = k % h->n_buckets; \
142+
inc = 1 + k % (h->n_buckets - 1); last = i; \
143+
while (!__ac_isempty(h->flags, i) && (__ac_isdel(h->flags, i) || !__hash_equal(h->keys[i], key))) { \
144+
if (i + inc >= h->n_buckets) i = i + inc - h->n_buckets; \
145+
else i += inc; \
146+
if (i == last) return h->n_buckets; \
147+
} \
148+
return __ac_iseither(h->flags, i)? h->n_buckets : i; \
149+
} else return 0; \
150+
} \
151+
static inline void kh_resize_##name(kh_##name##_t *h, khint_t new_n_buckets) \
152+
{ \
153+
uint32_t *new_flags = 0; \
154+
khint_t j = 1; \
155+
{ \
156+
khint_t t = __ac_HASH_PRIME_SIZE - 1; \
157+
while (__ac_prime_list[t] > new_n_buckets) --t; \
158+
new_n_buckets = __ac_prime_list[t+1]; \
159+
if (h->size >= (khint_t)(new_n_buckets * __ac_HASH_UPPER + 0.5)) j = 0; \
160+
else { \
161+
new_flags = (uint32_t*)malloc(((new_n_buckets>>4) + 1) * sizeof(uint32_t)); \
162+
memset(new_flags, 0xaa, ((new_n_buckets>>4) + 1) * sizeof(uint32_t)); \
163+
if (h->n_buckets < new_n_buckets) { \
164+
h->keys = (khkey_t*)realloc(h->keys, new_n_buckets * sizeof(khkey_t)); \
165+
if (kh_is_map) \
166+
h->vals = (khval_t*)realloc(h->vals, new_n_buckets * sizeof(khval_t)); \
167+
} \
168+
} \
169+
} \
170+
if (j) { \
171+
for (j = 0; j != h->n_buckets; ++j) { \
172+
if (__ac_iseither(h->flags, j) == 0) { \
173+
khkey_t key = h->keys[j]; \
174+
khval_t val; \
175+
if (kh_is_map) val = h->vals[j]; \
176+
__ac_set_isdel_true(h->flags, j); \
177+
while (1) { \
178+
khint_t inc, k, i; \
179+
k = __hash_func(key); \
180+
i = k % new_n_buckets; \
181+
inc = 1 + k % (new_n_buckets - 1); \
182+
while (!__ac_isempty(new_flags, i)) { \
183+
if (i + inc >= new_n_buckets) i = i + inc - new_n_buckets; \
184+
else i += inc; \
185+
} \
186+
__ac_set_isempty_false(new_flags, i); \
187+
if (i < h->n_buckets && __ac_iseither(h->flags, i) == 0) { \
188+
{ khkey_t tmp = h->keys[i]; h->keys[i] = key; key = tmp; } \
189+
if (kh_is_map) { khval_t tmp = h->vals[i]; h->vals[i] = val; val = tmp; } \
190+
__ac_set_isdel_true(h->flags, i); \
191+
} else { \
192+
h->keys[i] = key; \
193+
if (kh_is_map) h->vals[i] = val; \
194+
break; \
195+
} \
196+
} \
197+
} \
198+
} \
199+
if (h->n_buckets > new_n_buckets) { \
200+
h->keys = (khkey_t*)realloc(h->keys, new_n_buckets * sizeof(khkey_t)); \
201+
if (kh_is_map) \
202+
h->vals = (khval_t*)realloc(h->vals, new_n_buckets * sizeof(khval_t)); \
203+
} \
204+
free(h->flags); \
205+
h->flags = new_flags; \
206+
h->n_buckets = new_n_buckets; \
207+
h->n_occupied = h->size; \
208+
h->upper_bound = (khint_t)(h->n_buckets * __ac_HASH_UPPER + 0.5); \
209+
} \
210+
} \
211+
static inline khint_t kh_put_##name(kh_##name##_t *h, khkey_t key, int *ret) \
212+
{ \
213+
khint_t x; \
214+
if (h->n_occupied >= h->upper_bound) { \
215+
if (h->n_buckets > (h->size<<1)) kh_resize_##name(h, h->n_buckets - 1); \
216+
else kh_resize_##name(h, h->n_buckets + 1); \
217+
} \
218+
{ \
219+
khint_t inc, k, i, site, last; \
220+
x = site = h->n_buckets; k = __hash_func(key); i = k % h->n_buckets; \
221+
if (__ac_isempty(h->flags, i)) x = i; \
222+
else { \
223+
inc = 1 + k % (h->n_buckets - 1); last = i; \
224+
while (!__ac_isempty(h->flags, i) && (__ac_isdel(h->flags, i) || !__hash_equal(h->keys[i], key))) { \
225+
if (__ac_isdel(h->flags, i)) site = i; \
226+
if (i + inc >= h->n_buckets) i = i + inc - h->n_buckets; \
227+
else i += inc; \
228+
if (i == last) { x = site; break; } \
229+
} \
230+
if (x == h->n_buckets) { \
231+
if (__ac_isempty(h->flags, i) && site != h->n_buckets) x = site; \
232+
else x = i; \
233+
} \
234+
} \
235+
} \
236+
if (__ac_isempty(h->flags, x)) { \
237+
h->keys[x] = key; \
238+
__ac_set_isboth_false(h->flags, x); \
239+
++h->size; ++h->n_occupied; \
240+
*ret = 1; \
241+
} else if (__ac_isdel(h->flags, x)) { \
242+
h->keys[x] = key; \
243+
__ac_set_isboth_false(h->flags, x); \
244+
++h->size; \
245+
*ret = 2; \
246+
} else *ret = 0; \
247+
return x; \
248+
} \
249+
static inline void kh_del_##name(kh_##name##_t *h, khint_t x) \
250+
{ \
251+
if (x != h->n_buckets && !__ac_iseither(h->flags, x)) { \
252+
__ac_set_isdel_true(h->flags, x); \
253+
--h->size; \
254+
} \
255+
}
256+
257+
/* --- BEGIN OF HASH FUNCTIONS --- */
258+
259+
#define kh_int_hash_func(key) (uint32_t)(key)
260+
#define kh_int_hash_equal(a, b) (a == b)
261+
#define kh_int64_hash_func(key) (uint32_t)((key)>>33^(key)^(key)<<11)
262+
#define kh_int64_hash_equal(a, b) (a == b)
263+
static inline khint_t __ac_X31_hash_string(const char *s)
264+
{
265+
khint_t h = *s;
266+
if (h) for (++s ; *s; ++s) h = (h << 5) - h + *s;
267+
return h;
268+
}
269+
#define kh_str_hash_func(key) __ac_X31_hash_string(key)
270+
#define kh_str_hash_equal(a, b) (strcmp(a, b) == 0)
271+
272+
/* --- END OF HASH FUNCTIONS --- */
273+
274+
/* Other necessary macros... */
275+
276+
#define khash_t(name) kh_##name##_t
277+
278+
#define kh_init(name) kh_init_##name()
279+
#define kh_destroy(name, h) kh_destroy_##name(h)
280+
#define kh_clear(name, h) kh_clear_##name(h)
281+
#define kh_resize(name, h, s) kh_resize_##name(h, s)
282+
#define kh_put(name, h, k, r) kh_put_##name(h, k, r)
283+
#define kh_get(name, h, k) kh_get_##name(h, k)
284+
#define kh_del(name, h, k) kh_del_##name(h, k)
285+
286+
#define kh_exist(h, x) (!__ac_iseither((h)->flags, (x)))
287+
#define kh_key(h, x) ((h)->keys[x])
288+
#define kh_val(h, x) ((h)->vals[x])
289+
#define kh_value(h, x) ((h)->vals[x])
290+
#define kh_begin(h) (khint_t)(0)
291+
#define kh_end(h) ((h)->n_buckets)
292+
#define kh_size(h) ((h)->size)
293+
#define kh_n_buckets(h) ((h)->n_buckets)
294+
295+
/* More conenient interfaces */
296+
297+
#define KHASH_SET_INIT_INT(name) \
298+
KHASH_INIT(name, uint32_t, char, 0, kh_int_hash_func, kh_int_hash_equal)
299+
300+
#define KHASH_MAP_INIT_INT(name, khval_t) \
301+
KHASH_INIT(name, uint32_t, khval_t, 1, kh_int_hash_func, kh_int_hash_equal)
302+
303+
#define KHASH_SET_INIT_INT64(name) \
304+
KHASH_INIT(name, uint64_t, char, 0, kh_int64_hash_func, kh_int64_hash_equal)
305+
306+
#define KHASH_MAP_INIT_INT64(name, khval_t) \
307+
KHASH_INIT(name, uint64_t, khval_t, 1, kh_int64_hash_func, kh_int64_hash_equal)
308+
309+
typedef const char *kh_cstr_t;
310+
#define KHASH_SET_INIT_STR(name) \
311+
KHASH_INIT(name, kh_cstr_t, char, 0, kh_str_hash_func, kh_str_hash_equal)
312+
313+
#define KHASH_MAP_INIT_STR(name, khval_t) \
314+
KHASH_INIT(name, kh_cstr_t, khval_t, 1, kh_str_hash_func, kh_str_hash_equal)
315+
316+
#include <Python.h>
317+
318+
#define kh_python_hash_func(key) PyObject_Hash(key)
319+
#define kh_python_hash_equal(a, b) PyObject_RichCompareBool(a, b, Py_EQ)
320+
321+
#define KHASH_SET_INIT_PYOBJECT(name, khval_t) \
322+
KHASH_INIT(name, PyObject*, khval_t, 0, kh_python_hash_func, kh_python_hash_equal)
323+
324+
KHASH_MAP_INIT_PYOBJECT(python, uint32_t)
325+
326+
#endif /* __AC_KHASH_H */

pandas/src/util.pxd

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,3 +46,4 @@ cdef inline set_value_at(ndarray arr, object loc, object value):
4646

4747
cdef inline int is_contiguous(ndarray arr):
4848
return cnp.PyArray_CHKFLAGS(arr, cnp.NPY_C_CONTIGUOUS)
49+

0 commit comments

Comments
 (0)