forked from pandas-dev/pandas
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathhashing.pyx
192 lines (154 loc) · 4.74 KB
/
hashing.pyx
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
# Translated from the reference implementation
# at https://github.com/veorq/SipHash
import cython
from libc.stdlib cimport malloc, free
import numpy as np
from numpy cimport ndarray, uint8_t, uint32_t, uint64_t, import_array
import_array()
from pandas._libs.util cimport is_nan
DEF cROUNDS = 2
DEF dROUNDS = 4
@cython.boundscheck(False)
def hash_object_array(ndarray[object] arr, object key, object encoding='utf8'):
"""
Parameters
----------
arr : 1-d object ndarray of objects
key : hash key, must be 16 byte len encoded
encoding : encoding for key & arr, default to 'utf8'
Returns
-------
1-d uint64 ndarray of hashes.
Raises
------
TypeError
If the array contains mixed types.
Notes
-----
Allowed values must be strings, or nulls
mixed array types will raise TypeError.
"""
cdef:
Py_ssize_t i, l, n
uint64_t[:] result
bytes data, k
uint8_t *kb
uint64_t *lens
char **vecs
char *cdata
object val
list datas = []
k = <bytes>key.encode(encoding)
kb = <uint8_t *>k
if len(k) != 16:
raise ValueError(
f"key should be a 16-byte string encoded, got {k} (len {len(k)})"
)
n = len(arr)
# create an array of bytes
vecs = <char **>malloc(n * sizeof(char *))
lens = <uint64_t*>malloc(n * sizeof(uint64_t))
for i in range(n):
val = arr[i]
if isinstance(val, bytes):
data = <bytes>val
elif isinstance(val, str):
data = <bytes>val.encode(encoding)
elif val is None or is_nan(val):
# null, stringify and encode
data = <bytes>str(val).encode(encoding)
elif isinstance(val, tuple):
# GH#28969 we could have a tuple, but need to ensure that
# the tuple entries are themselves hashable before converting
# to str
hash(val)
data = <bytes>str(val).encode(encoding)
else:
raise TypeError(
f"{val} of type {type(val)} is not a valid type for hashing, "
"must be string or null"
)
l = len(data)
lens[i] = l
cdata = data
# keep the references alive through the end of the
# function
datas.append(data)
vecs[i] = cdata
result = np.empty(n, dtype=np.uint64)
with nogil:
for i in range(n):
result[i] = low_level_siphash(<uint8_t *>vecs[i], lens[i], kb)
free(vecs)
free(lens)
return result.base # .base to retrieve underlying np.ndarray
cdef inline uint64_t _rotl(uint64_t x, uint64_t b) nogil:
return (x << b) | (x >> (64 - b))
cdef inline void u32to8_le(uint8_t* p, uint32_t v) nogil:
p[0] = <uint8_t>(v)
p[1] = <uint8_t>(v >> 8)
p[2] = <uint8_t>(v >> 16)
p[3] = <uint8_t>(v >> 24)
cdef inline uint64_t u8to64_le(uint8_t* p) nogil:
return (<uint64_t>p[0] |
<uint64_t>p[1] << 8 |
<uint64_t>p[2] << 16 |
<uint64_t>p[3] << 24 |
<uint64_t>p[4] << 32 |
<uint64_t>p[5] << 40 |
<uint64_t>p[6] << 48 |
<uint64_t>p[7] << 56)
cdef inline void _sipround(uint64_t* v0, uint64_t* v1,
uint64_t* v2, uint64_t* v3) nogil:
v0[0] += v1[0]
v1[0] = _rotl(v1[0], 13)
v1[0] ^= v0[0]
v0[0] = _rotl(v0[0], 32)
v2[0] += v3[0]
v3[0] = _rotl(v3[0], 16)
v3[0] ^= v2[0]
v0[0] += v3[0]
v3[0] = _rotl(v3[0], 21)
v3[0] ^= v0[0]
v2[0] += v1[0]
v1[0] = _rotl(v1[0], 17)
v1[0] ^= v2[0]
v2[0] = _rotl(v2[0], 32)
@cython.cdivision(True)
cdef uint64_t low_level_siphash(uint8_t* data, size_t datalen,
uint8_t* key) nogil:
cdef uint64_t v0 = 0x736f6d6570736575ULL
cdef uint64_t v1 = 0x646f72616e646f6dULL
cdef uint64_t v2 = 0x6c7967656e657261ULL
cdef uint64_t v3 = 0x7465646279746573ULL
cdef uint64_t b
cdef uint64_t k0 = u8to64_le(key)
cdef uint64_t k1 = u8to64_le(key + 8)
cdef uint64_t m
cdef int i
cdef uint8_t* end = data + datalen - (datalen % sizeof(uint64_t))
cdef int left = datalen & 7
cdef int left_byte
b = (<uint64_t>datalen) << 56
v3 ^= k1
v2 ^= k0
v1 ^= k1
v0 ^= k0
while (data != end):
m = u8to64_le(data)
v3 ^= m
for i in range(cROUNDS):
_sipround(&v0, &v1, &v2, &v3)
v0 ^= m
data += sizeof(uint64_t)
for i in range(left-1, -1, -1):
b |= (<uint64_t>data[i]) << (i * 8)
v3 ^= b
for i in range(cROUNDS):
_sipround(&v0, &v1, &v2, &v3)
v0 ^= b
v2 ^= 0xff
for i in range(dROUNDS):
_sipround(&v0, &v1, &v2, &v3)
b = v0 ^ v1 ^ v2 ^ v3
return b