Skip to content

Commit 996867b

Browse files
committed
Change str hash function to xxhash
1 parent 33254bc commit 996867b

File tree

5 files changed

+1224
-6
lines changed

5 files changed

+1224
-6
lines changed

pandas/src/klib/khash_python.h

+97-2
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,101 @@
2323
#define kh_exist_int64(h, k) (kh_exist(h, k))
2424
#define kh_exist_int32(h, k) (kh_exist(h, k))
2525

26-
KHASH_MAP_INIT_STR(str, size_t)
26+
#include "xxhash/xxhash.h"
27+
28+
/*
29+
* By default khash uses the weak x31 hash function, which puts strings that
30+
* differ only in the last character into neighbouring buckets which is not
31+
* good given that quadratic probing tries small steps first.
32+
*
33+
* xxhash gives better bucket distribution and performance-wise is great for
34+
* long-ish strings, but it is a bit slower than x31 on the shortest ones
35+
* (turns out at length == 2 the difference is already negligible).
36+
*
37+
* Inlining will hinder merging in upstream releases, but 1-character strings
38+
* are a valid use case for pandas, so let's pre-calculate a vector of 256
39+
* values to avoid calling two functions (strlen and XXH32) if there's only one
40+
* character to hash.
41+
*
42+
* This table was generated with the following code. Feel free to re-run it if
43+
* an update comes in:
44+
45+
#include <stdio.h>
46+
#include "xxhash.h"
47+
48+
int main(int argc, char *argv[])
49+
{
50+
printf("static khint_t XXH32_EMPTY_HASH = 0x%08x;\n",
51+
XXH32("", 0, 0xdeadbeef));
52+
printf("static khint_t XXH32_ONECHAR_HASH[256] = {");
53+
unsigned char s[2] = {0};
54+
for (int i = 0; i < 256; ++i) {
55+
if (i % 8 == 0) {
56+
printf("\n ");
57+
}
58+
s[0] = i;
59+
printf("0x%08x", XXH32(s, 1, 0xdeadbeef));
60+
if (i < 255) {
61+
printf(", ");
62+
}
63+
}
64+
printf("\n};\n");
65+
return 0;
66+
}
67+
*/
68+
69+
/* Precomputed XXH32 hash of the empty string (length 0) with seed 0xdeadbeef,
 * as printed by the generator snippet in the comment above. Returned by
 * str_xxhash_hash_func for empty keys. */
static khint_t XXH32_EMPTY_HASH = 0xc372c6cb;
70+
/* Precomputed XXH32 hashes of every one-byte string: entry i is the hash of
 * the single byte i with seed 0xdeadbeef, as printed by the generator snippet
 * in the comment above. str_xxhash_hash_func indexes this table with the
 * key's only character cast to uint8_t. */
static khint_t XXH32_ONECHAR_HASH[256] = {
71+
0x39110451, 0xd3efa134, 0xea8d6dc4, 0xe59a066b, 0x89f3a4f5, 0xdcce5bc9, 0x44be0c3e, 0x96469248,
72+
0x7885ddeb, 0x24417b24, 0xb77b30b2, 0xa83d21eb, 0x6f6ba52b, 0x7315bbe5, 0xce858701, 0x52299f26,
73+
0x440ec810, 0xd02a934f, 0xf873d394, 0xd168a8e1, 0x31c30198, 0x37c3967b, 0xc1bdbdf8, 0x3ddaf3cc,
74+
0xb7222f4a, 0x96625cdf, 0xabf92a2f, 0x69e97975, 0x55f24523, 0x6b1abaa0, 0xe5b033ab, 0x9e21842c,
75+
0x3ac2a339, 0x827b0af2, 0xd7ea0f97, 0x72317ee6, 0xe6bd4439, 0xb0b183f1, 0xca90e5e0, 0x57960753,
76+
0x6eefe374, 0xb9c9c5b5, 0x57396d1f, 0x6db79351, 0xab55c12d, 0x32229df4, 0xbfa3a164, 0x58f9f4ba,
77+
0x5987c643, 0xffbfa961, 0x1080d4eb, 0xc5c3d846, 0x16a7fd8e, 0xed29fd3a, 0x8d78613d, 0xd088b720,
78+
0x8d597f4c, 0x2df1ce8f, 0x79bc5215, 0x749d67c1, 0xa9ad300c, 0x60c6237d, 0xeeb080e7, 0xb74eef62,
79+
0x6ddba2f2, 0x3d9f18cf, 0x0b6ad1bd, 0xc7a33d19, 0x3cb6352f, 0x872839f9, 0x259ced1e, 0x0f9d713b,
80+
0x6816620f, 0x8d2c96a7, 0x377fb2f9, 0x2616b5b5, 0x9bae3a05, 0x8368a004, 0x3a67fd94, 0x312529c4,
81+
0xc9238f87, 0x3e85e142, 0x973dedc6, 0xcbc3d4ba, 0xd2629b58, 0x2aae9a6d, 0x82ffc598, 0x4a8512b3,
82+
0x51146ceb, 0x85ddc3f4, 0xa83b942f, 0x55769a32, 0xf7fa3fdf, 0xfbe35842, 0x342ff574, 0x848400a6,
83+
0x92707153, 0x48cd58fd, 0xbdae4a11, 0x701bbadb, 0x4a5b37c4, 0x98770eeb, 0xfc1b98fc, 0x05dd6894,
84+
0xd3ba005c, 0x453bc774, 0xfe186d14, 0xa25acde2, 0xcc738313, 0x1dbdefa7, 0x83ed6f1e, 0xf9d8e195,
85+
0x5f10c546, 0xf22c5a0f, 0x31da5f5e, 0x5341c163, 0xabd3f750, 0x882e33d8, 0x4d8105cd, 0xc1f6f3d9,
86+
0x347e1d5c, 0xdb06193c, 0x64841a53, 0x3991a6e6, 0x0abdd625, 0xedcf00f7, 0xa8e64229, 0x2fc9029b,
87+
0x4fc5ca41, 0x1f5aaae5, 0x29bdda91, 0x55446dae, 0x1566ec40, 0x9ac8391e, 0xcd4d6ab1, 0x0f3807f6,
88+
0xf3be6887, 0x9f4b88bd, 0x33c401df, 0xaa9df64f, 0xce5c70ac, 0x9ee55a87, 0x4cb91c84, 0x8c322b3d,
89+
0x8e40fb24, 0x3af430fb, 0xeea567c2, 0xe80c7dc2, 0x6f619449, 0xe0ca8048, 0x984c626e, 0x50bf1281,
90+
0x4895cbee, 0x5d016a96, 0xe58b8980, 0x3457ef7c, 0x2a24f819, 0x0641cc30, 0xbddc5f84, 0x03ce4656,
91+
0xbcb73c9c, 0xcd29be82, 0x0930d945, 0xf3fc8e3c, 0xbed775cd, 0xd6668fae, 0x6876f949, 0xcf34fbd7,
92+
0x0537d916, 0x7efd5f26, 0xb2d32520, 0x10d58995, 0x19d64e1c, 0xacae767c, 0xf23a4e7d, 0xdcb654fe,
93+
0xe1ec9a9f, 0x3061302b, 0x453a0b7c, 0xe845436e, 0xb2b690df, 0x245c17b5, 0x756a9374, 0x470998f5,
94+
0xe31a5f5b, 0x60dbad02, 0xf738299d, 0x0db8b11a, 0xd34cb801, 0xb2f3597d, 0xa627e466, 0xda4f9935,
95+
0x5c58e1df, 0x4b5319d6, 0x48acc08f, 0xce18d68e, 0xeb995e7f, 0x11a07cba, 0x025127b2, 0xd1325331,
96+
0x55d76240, 0x281bba14, 0xb9ac069d, 0x25e60bcc, 0xf077fbd3, 0xe460ece9, 0x725a9971, 0xa6b5c6b4,
97+
0xe5f216a3, 0xbee80d71, 0x1a049114, 0x851012d4, 0xa6e175cc, 0x6ec98c95, 0x56a77202, 0x7e2ab05f,
98+
0x4850279c, 0x1b009afe, 0xf71e36b6, 0x9cadc37a, 0x43a167da, 0x5d75b5f3, 0xc432215c, 0x93ff1905,
99+
0x8764d057, 0xf44cd35d, 0x03d3a324, 0xd65a5047, 0xe872b4d8, 0x8dcb9a23, 0xfebf9113, 0x59701be9,
100+
0xdf9f6090, 0xce9b2907, 0x664c6a5a, 0x81bfefc4, 0x13829979, 0xda98b6ab, 0x7b7e9ff0, 0x13c24005,
101+
0xcee61b6b, 0x15737a85, 0xe2f95e48, 0xf2136570, 0xd1ccfdab, 0xa9adfb16, 0x1f7339a9, 0x83247f43,
102+
0x68c6c8bf, 0x5046f6fc, 0x2d3dea84, 0x79a0be74, 0x39dd7eb3, 0x4d5cc636, 0xe4e1352d, 0xd1317a99
103+
};
104+
105+
/* Seed passed to XXH32() for keys of two or more characters. The value itself
 * is chosen arbitrarily, but it should be kept equal to the seed used to
 * generate XXH32_EMPTY_HASH and XXH32_ONECHAR_HASH (0xdeadbeef in the
 * generator snippet above) so the precomputed values stay identical to what
 * XXH32() would return for the same keys. */
106+
static khint_t XXH32_SEED = 0xdeadbeef;
107+
108+
/*
 * Hash a NUL-terminated string key for the khash string tables.
 *
 * Empty and one-character keys are answered from the precomputed
 * XXH32_EMPTY_HASH / XXH32_ONECHAR_HASH values, so neither strlen() nor
 * XXH32() is called for them. Every longer key is hashed with XXH32() over
 * strlen(key) bytes using XXH32_SEED.
 *
 * key must be non-NULL: key[0] is read unconditionally.
 */
static khint_t PANDAS_INLINE str_xxhash_hash_func(kh_cstr_t key) {
109+
if (!key[0]) {  /* empty string */
110+
return XXH32_EMPTY_HASH;
111+
}
112+
if (!key[1]) {  /* exactly one character before the terminator */
113+
return XXH32_ONECHAR_HASH[(uint8_t)key[0]];  /* uint8_t cast keeps the index within 0..255 */
114+
}
115+
return XXH32(key, strlen(key), XXH32_SEED);  /* general case: two or more characters */
116+
}
117+
118+
/* Instantiate the "str" khash table (kh_cstr_t keys, size_t values) with
 * str_xxhash_hash_func as the hash function; key comparison uses
 * kh_str_hash_equal. The literal 1 is presumably khash's "is a map" flag --
 * confirm against KHASH_INIT in khash.h. */
KHASH_INIT(str, kh_cstr_t, size_t, 1,
119+
str_xxhash_hash_func, kh_str_hash_equal)
120+
27121
KHASH_MAP_INIT_INT(int32, size_t)
28122
KHASH_MAP_INIT_INT64(int64, size_t)
29123

@@ -71,6 +165,7 @@ KHASH_SET_INIT_PYOBJECT(pyset)
71165
#define kh_exist_pymap(h, k) (kh_exist(h, k))
72166
#define kh_exist_pyset(h, k) (kh_exist(h, k))
73167

74-
KHASH_MAP_INIT_STR(strbox, kh_pyobject_t)
168+
/* Instantiate the "strbox" khash table (kh_cstr_t keys, kh_pyobject_t values)
 * with str_xxhash_hash_func as the hash function; key comparison uses
 * kh_str_hash_equal. The literal 1 is presumably khash's "is a map" flag --
 * confirm against KHASH_INIT in khash.h. */
KHASH_INIT(strbox, kh_cstr_t, kh_pyobject_t, 1,
169+
str_xxhash_hash_func, kh_str_hash_equal)
75170

76171
#endif /* _KLIB_KHASH_PYTHON_H_ */

pandas/src/xxhash/LICENSE

+24
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
xxHash Library
2+
Copyright (c) 2012-2014, Yann Collet
3+
All rights reserved.
4+
5+
Redistribution and use in source and binary forms, with or without modification,
6+
are permitted provided that the following conditions are met:
7+
8+
* Redistributions of source code must retain the above copyright notice, this
9+
list of conditions and the following disclaimer.
10+
11+
* Redistributions in binary form must reproduce the above copyright notice, this
12+
list of conditions and the following disclaimer in the documentation and/or
13+
other materials provided with the distribution.
14+
15+
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
16+
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
17+
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
18+
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
19+
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
20+
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
21+
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
22+
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23+
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
24+
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

0 commit comments

Comments
 (0)