|
23 | 23 | #define kh_exist_int64(h, k) (kh_exist(h, k))
|
24 | 24 | #define kh_exist_int32(h, k) (kh_exist(h, k))
|
25 | 25 |
|
26 |
| -KHASH_MAP_INIT_STR(str, size_t) |
| 26 | +#include "xxhash/xxhash.h" |
| 27 | + |
| 28 | +/* |
| 29 | + * By default khash uses crappy x31 hash function which puts strings that |
| 30 | + * differ only in the last character into neighbouring buckets which is not |
| 31 | + * good given that quadratic probing tries small steps first. |
| 32 | + * |
| 33 | + * xxhash gives better bucket distribution and performance-wise is great for |
| 34 | + * long-ish strings, but it is a bit slower than x31 on the shortest ones |
| 35 | + * (turns out at length == 2 the difference is already negligible). |
| 36 | + * |
| 37 | + * Inlining will hinder merging in upstream releases, but 1-character strings |
| 38 | + * are a valid use case for pandas, so let's pre-calculate a vector of 256 |
| 39 | + * values to avoid calling two functions (strlen and XXH32) if there's only one |
| 40 | + * character to hash. |
| 41 | + * |
| 42 | + * This table was generated with the following code. Feel free to re-run it if |
| 43 | + * an update comes in: |
| 44 | +
|
| 45 | +#include <stdio.h> |
| 46 | +#include "xxhash.h" |
| 47 | +
|
| 48 | +int main(int argc, char *argv[]) |
| 49 | +{ |
| 50 | + printf("static khint_t XXH32_EMPTY_HASH = 0x%08x;\n", |
| 51 | + XXH32("", 0, 0xdeadbeef)); |
| 52 | + printf("static khint_t XXH32_ONECHAR_HASH[256] = {"); |
| 53 | + unsigned char s[2] = {0}; |
| 54 | + for (int i = 0; i < 256; ++i) { |
| 55 | + if (i % 8 == 0) { |
| 56 | + printf("\n "); |
| 57 | + } |
| 58 | + s[0] = i; |
| 59 | + printf("0x%08x", XXH32(s, 1, 0xdeadbeef)); |
| 60 | + if (i < 255) { |
| 61 | + printf(", "); |
| 62 | + } |
| 63 | + } |
| 64 | + printf("\n};\n"); |
| 65 | + return 0; |
| 66 | +} |
| 67 | +*/ |
| 68 | + |
| 69 | +static khint_t XXH32_EMPTY_HASH = 0xc372c6cb; |
| 70 | +static khint_t XXH32_ONECHAR_HASH[256] = { |
| 71 | + 0x39110451, 0xd3efa134, 0xea8d6dc4, 0xe59a066b, 0x89f3a4f5, 0xdcce5bc9, 0x44be0c3e, 0x96469248, |
| 72 | + 0x7885ddeb, 0x24417b24, 0xb77b30b2, 0xa83d21eb, 0x6f6ba52b, 0x7315bbe5, 0xce858701, 0x52299f26, |
| 73 | + 0x440ec810, 0xd02a934f, 0xf873d394, 0xd168a8e1, 0x31c30198, 0x37c3967b, 0xc1bdbdf8, 0x3ddaf3cc, |
| 74 | + 0xb7222f4a, 0x96625cdf, 0xabf92a2f, 0x69e97975, 0x55f24523, 0x6b1abaa0, 0xe5b033ab, 0x9e21842c, |
| 75 | + 0x3ac2a339, 0x827b0af2, 0xd7ea0f97, 0x72317ee6, 0xe6bd4439, 0xb0b183f1, 0xca90e5e0, 0x57960753, |
| 76 | + 0x6eefe374, 0xb9c9c5b5, 0x57396d1f, 0x6db79351, 0xab55c12d, 0x32229df4, 0xbfa3a164, 0x58f9f4ba, |
| 77 | + 0x5987c643, 0xffbfa961, 0x1080d4eb, 0xc5c3d846, 0x16a7fd8e, 0xed29fd3a, 0x8d78613d, 0xd088b720, |
| 78 | + 0x8d597f4c, 0x2df1ce8f, 0x79bc5215, 0x749d67c1, 0xa9ad300c, 0x60c6237d, 0xeeb080e7, 0xb74eef62, |
| 79 | + 0x6ddba2f2, 0x3d9f18cf, 0x0b6ad1bd, 0xc7a33d19, 0x3cb6352f, 0x872839f9, 0x259ced1e, 0x0f9d713b, |
| 80 | + 0x6816620f, 0x8d2c96a7, 0x377fb2f9, 0x2616b5b5, 0x9bae3a05, 0x8368a004, 0x3a67fd94, 0x312529c4, |
| 81 | + 0xc9238f87, 0x3e85e142, 0x973dedc6, 0xcbc3d4ba, 0xd2629b58, 0x2aae9a6d, 0x82ffc598, 0x4a8512b3, |
| 82 | + 0x51146ceb, 0x85ddc3f4, 0xa83b942f, 0x55769a32, 0xf7fa3fdf, 0xfbe35842, 0x342ff574, 0x848400a6, |
| 83 | + 0x92707153, 0x48cd58fd, 0xbdae4a11, 0x701bbadb, 0x4a5b37c4, 0x98770eeb, 0xfc1b98fc, 0x05dd6894, |
| 84 | + 0xd3ba005c, 0x453bc774, 0xfe186d14, 0xa25acde2, 0xcc738313, 0x1dbdefa7, 0x83ed6f1e, 0xf9d8e195, |
| 85 | + 0x5f10c546, 0xf22c5a0f, 0x31da5f5e, 0x5341c163, 0xabd3f750, 0x882e33d8, 0x4d8105cd, 0xc1f6f3d9, |
| 86 | + 0x347e1d5c, 0xdb06193c, 0x64841a53, 0x3991a6e6, 0x0abdd625, 0xedcf00f7, 0xa8e64229, 0x2fc9029b, |
| 87 | + 0x4fc5ca41, 0x1f5aaae5, 0x29bdda91, 0x55446dae, 0x1566ec40, 0x9ac8391e, 0xcd4d6ab1, 0x0f3807f6, |
| 88 | + 0xf3be6887, 0x9f4b88bd, 0x33c401df, 0xaa9df64f, 0xce5c70ac, 0x9ee55a87, 0x4cb91c84, 0x8c322b3d, |
| 89 | + 0x8e40fb24, 0x3af430fb, 0xeea567c2, 0xe80c7dc2, 0x6f619449, 0xe0ca8048, 0x984c626e, 0x50bf1281, |
| 90 | + 0x4895cbee, 0x5d016a96, 0xe58b8980, 0x3457ef7c, 0x2a24f819, 0x0641cc30, 0xbddc5f84, 0x03ce4656, |
| 91 | + 0xbcb73c9c, 0xcd29be82, 0x0930d945, 0xf3fc8e3c, 0xbed775cd, 0xd6668fae, 0x6876f949, 0xcf34fbd7, |
| 92 | + 0x0537d916, 0x7efd5f26, 0xb2d32520, 0x10d58995, 0x19d64e1c, 0xacae767c, 0xf23a4e7d, 0xdcb654fe, |
| 93 | + 0xe1ec9a9f, 0x3061302b, 0x453a0b7c, 0xe845436e, 0xb2b690df, 0x245c17b5, 0x756a9374, 0x470998f5, |
| 94 | + 0xe31a5f5b, 0x60dbad02, 0xf738299d, 0x0db8b11a, 0xd34cb801, 0xb2f3597d, 0xa627e466, 0xda4f9935, |
| 95 | + 0x5c58e1df, 0x4b5319d6, 0x48acc08f, 0xce18d68e, 0xeb995e7f, 0x11a07cba, 0x025127b2, 0xd1325331, |
| 96 | + 0x55d76240, 0x281bba14, 0xb9ac069d, 0x25e60bcc, 0xf077fbd3, 0xe460ece9, 0x725a9971, 0xa6b5c6b4, |
| 97 | + 0xe5f216a3, 0xbee80d71, 0x1a049114, 0x851012d4, 0xa6e175cc, 0x6ec98c95, 0x56a77202, 0x7e2ab05f, |
| 98 | + 0x4850279c, 0x1b009afe, 0xf71e36b6, 0x9cadc37a, 0x43a167da, 0x5d75b5f3, 0xc432215c, 0x93ff1905, |
| 99 | + 0x8764d057, 0xf44cd35d, 0x03d3a324, 0xd65a5047, 0xe872b4d8, 0x8dcb9a23, 0xfebf9113, 0x59701be9, |
| 100 | + 0xdf9f6090, 0xce9b2907, 0x664c6a5a, 0x81bfefc4, 0x13829979, 0xda98b6ab, 0x7b7e9ff0, 0x13c24005, |
| 101 | + 0xcee61b6b, 0x15737a85, 0xe2f95e48, 0xf2136570, 0xd1ccfdab, 0xa9adfb16, 0x1f7339a9, 0x83247f43, |
| 102 | + 0x68c6c8bf, 0x5046f6fc, 0x2d3dea84, 0x79a0be74, 0x39dd7eb3, 0x4d5cc636, 0xe4e1352d, 0xd1317a99 |
| 103 | +}; |
| 104 | + |
| 105 | +/* Seed value is chosen arbitrarily. */ |
| 106 | +static khint_t XXH32_SEED = 0xdeadbeef; |
| 107 | + |
| 108 | +static khint_t PANDAS_INLINE str_xxhash_hash_func(kh_cstr_t key) { |
| 109 | + if (!key[0]) { |
| 110 | + return XXH32_EMPTY_HASH; |
| 111 | + } |
| 112 | + if (!key[1]) { |
| 113 | + return XXH32_ONECHAR_HASH[(uint8_t)key[0]]; |
| 114 | + } |
| 115 | + return XXH32(key, strlen(key), XXH32_SEED); |
| 116 | +} |
| 117 | + |
| 118 | +KHASH_INIT(str, kh_cstr_t, size_t, 1, |
| 119 | + str_xxhash_hash_func, kh_str_hash_equal) |
| 120 | + |
27 | 121 | KHASH_MAP_INIT_INT(int32, size_t)
|
28 | 122 | KHASH_MAP_INIT_INT64(int64, size_t)
|
29 | 123 |
|
@@ -71,6 +165,7 @@ KHASH_SET_INIT_PYOBJECT(pyset)
|
71 | 165 | #define kh_exist_pymap(h, k) (kh_exist(h, k))
|
72 | 166 | #define kh_exist_pyset(h, k) (kh_exist(h, k))
|
73 | 167 |
|
74 |
| -KHASH_MAP_INIT_STR(strbox, kh_pyobject_t) |
| 168 | +KHASH_INIT(strbox, kh_cstr_t, kh_pyobject_t, 1, |
| 169 | + str_xxhash_hash_func, kh_str_hash_equal) |
75 | 170 |
|
76 | 171 | #endif /* _KLIB_KHASH_PYTHON_H_ */
|
0 commit comments