forked from pandas-dev/pandas
-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathhashing.pyx
191 lines (151 loc) · 4.83 KB
/
hashing.pyx
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
# -*- coding: utf-8 -*-
# Translated from the reference implementation
# at https://github.com/veorq/SipHash
import cython
from cpython cimport PyBytes_Check, PyUnicode_Check
from libc.stdlib cimport malloc, free
import numpy as np
from numpy cimport uint8_t, uint32_t, uint64_t, import_array
import_array()
from util cimport is_nan
# Compile-time round counts used by low_level_siphash:
# cROUNDS is the number of _sipround calls applied after each 8-byte input
# word (and after the final length/tail word) is mixed in.
DEF cROUNDS = 2
# dROUNDS is the number of _sipround calls applied in the finalization step,
# after all input has been absorbed.
DEF dROUNDS = 4
@cython.boundscheck(False)
def hash_object_array(object[:] arr, object key, object encoding='utf8'):
    """
    Hash every element of a 1-d object array with SipHash.

    Parameters
    ----------
    arr : 1-d object ndarray of objects
    key : hash key, must be 16 byte len encoded
    encoding : encoding for key & arr, default to 'utf8'

    Returns
    -------
    1-d uint64 ndarray of hashes

    Raises
    ------
    ValueError
        If the encoded key is not exactly 16 bytes long.
    TypeError
        If an element is not a string (bytes or unicode) or a null.
    MemoryError
        If the temporary C buffers cannot be allocated.

    Notes
    -----
    allowed values must be strings, or nulls
    mixed array types will raise TypeError
    """
    cdef:
        Py_ssize_t i, n
        uint64_t[:] result
        bytes data, k
        uint8_t *kb
        uint64_t *lens
        char **vecs
        char *cdata
        object val
        # owns the encoded byte strings so the raw pointers stored in
        # ``vecs`` stay valid until hashing has finished
        list datas = []

    k = <bytes>key.encode(encoding)
    kb = <uint8_t *>k
    if len(k) != 16:
        raise ValueError("key should be a 16-byte string encoded, "
                         "got {key} (len {klen})".format(key=k, klen=len(k)))

    n = len(arr)

    # per-element data pointers and byte lengths, readable without the GIL
    vecs = <char **> malloc(n * sizeof(char *))
    lens = <uint64_t*> malloc(n * sizeof(uint64_t))

    try:
        # malloc(0) is allowed to return NULL, so a NULL result only
        # signals failure when memory was actually requested
        if n > 0 and (vecs == NULL or lens == NULL):
            raise MemoryError()

        for i in range(n):
            val = arr[i]
            if PyBytes_Check(val):
                data = <bytes>val
            elif PyUnicode_Check(val):
                data = <bytes>val.encode(encoding)
            elif val is None or is_nan(val):
                # null, stringify and encode
                data = <bytes>str(val).encode(encoding)
            else:
                raise TypeError("{val} of type {typ} is not a valid type "
                                "for hashing, must be string or null"
                                .format(val=val, typ=type(val)))

            lens[i] = len(data)
            cdata = data

            # keep the references alive thru the end of the
            # function
            datas.append(data)
            vecs[i] = cdata

        result = np.empty(n, dtype=np.uint64)
        with nogil:
            for i in range(n):
                result[i] = low_level_siphash(<uint8_t *>vecs[i], lens[i], kb)
    finally:
        # always release the C buffers, including when an element is
        # rejected with TypeError or encoding fails; free(NULL) is a no-op
        free(vecs)
        free(lens)

    return result.base  # .base to retrieve underlying np.ndarray
cdef inline uint64_t _rotl(uint64_t x, uint64_t b) nogil:
    """
    Rotate the 64-bit value ``x`` left by ``b`` bits.

    Both shift counts are reduced modulo 64, so ``b == 0`` and ``b >= 64``
    are well defined instead of shifting a 64-bit value by its full width
    (undefined behaviour in C). Results for ``b`` in 1..63 are unchanged.
    """
    return (x << (b & 63)) | (x >> ((64 - b) & 63))
cdef inline void u32to8_le(uint8_t* p, uint32_t v) nogil:
    """
    Store the 32-bit value ``v`` into ``p[0]``..``p[3]`` in little-endian
    byte order (least significant byte first).
    """
    cdef int k
    for k in range(4):
        # byte k holds bits [8k, 8k + 8) of v
        p[k] = <uint8_t>(v >> (8 * k))
cdef inline uint64_t u8to64_le(uint8_t* p) nogil:
    """
    Read ``p[0]``..``p[7]`` as a little-endian unsigned 64-bit integer
    (``p[0]`` is the least significant byte).
    """
    cdef:
        uint64_t word = 0
        int k

    # walk from the most significant byte down, shifting earlier bytes up
    for k in range(7, -1, -1):
        word = (word << 8) | <uint64_t>p[k]
    return word
cdef inline void _sipround(uint64_t* v0, uint64_t* v1,
                           uint64_t* v2, uint64_t* v3) nogil:
    """
    Apply one mixing round, in place, to the four 64-bit state words
    pointed to by ``v0``..``v3``.

    The round is a fixed sequence of additions, left rotations (via
    ``_rotl``) and XORs; the order of the steps is significant.
    """
    cdef:
        uint64_t a = v0[0]
        uint64_t b = v1[0]
        uint64_t c = v2[0]
        uint64_t d = v3[0]

    a += b
    b = _rotl(b, 13) ^ a
    a = _rotl(a, 32)

    c += d
    d = _rotl(d, 16) ^ c

    a += d
    d = _rotl(d, 21) ^ a

    c += b
    b = _rotl(b, 17) ^ c
    c = _rotl(c, 32)

    # publish the updated state back through the pointers
    v0[0] = a
    v1[0] = b
    v2[0] = c
    v3[0] = d
# TODO: This appears unused; remove?
cpdef uint64_t siphash(bytes data, bytes key) except? 0:
    """
    Hash a single byte string with ``low_level_siphash``.

    Parameters
    ----------
    data : bytes to hash
    key : hash key, must be exactly 16 bytes long

    Returns
    -------
    uint64 hash of ``data`` under ``key``

    Raises
    ------
    ValueError
        If ``key`` is not 16 bytes long.
    """
    cdef Py_ssize_t klen = len(key)

    if klen != 16:
        raise ValueError("key should be a 16-byte bytestring, "
                         "got {key} (len {klen})"
                         .format(key=key, klen=klen))
    return low_level_siphash(data, len(data), key)
@cython.cdivision(True)
cdef uint64_t low_level_siphash(uint8_t* data, size_t datalen,
                                uint8_t* key) nogil:
    """
    Compute the 64-bit SipHash of ``datalen`` bytes starting at ``data``.

    ``key`` must point to 16 readable bytes; they are read as two
    little-endian 64-bit words. The input is consumed as little-endian
    8-byte words, each followed by ``cROUNDS`` rounds; the remaining
    0-7 tail bytes are packed together with the low byte of the length
    into one last word, and ``dROUNDS`` rounds finish the hash.
    """
    cdef:
        uint64_t k0 = u8to64_le(key)
        uint64_t k1 = u8to64_le(key + 8)
        # initial state: fixed constants mixed with the two key words
        uint64_t v0 = 0x736f6d6570736575ULL ^ k0
        uint64_t v1 = 0x646f72616e646f6dULL ^ k1
        uint64_t v2 = 0x6c7967656e657261ULL ^ k0
        uint64_t v3 = 0x7465646279746573ULL ^ k1
        # number of complete 8-byte words and of leftover bytes
        size_t nwords = datalen >> 3
        size_t ntail = datalen & 7
        # last word: input length in the top byte, tail bytes below it
        uint64_t last = (<uint64_t>datalen) << 56
        uint64_t m
        size_t j
        int i

    # absorb every complete 8-byte word
    for j in range(nwords):
        m = u8to64_le(data)
        v3 ^= m
        for i in range(cROUNDS):
            _sipround(&v0, &v1, &v2, &v3)
        v0 ^= m
        data += sizeof(uint64_t)

    # ``data`` now points at the tail; place byte j at bits [8j, 8j + 8)
    for j in range(ntail):
        last |= (<uint64_t>data[j]) << (j * 8)

    v3 ^= last
    for i in range(cROUNDS):
        _sipround(&v0, &v1, &v2, &v3)
    v0 ^= last

    # finalization
    v2 ^= 0xff
    for i in range(dROUNDS):
        _sipround(&v0, &v1, &v2, &v3)

    return v0 ^ v1 ^ v2 ^ v3