Commit 06f26b5

ENH: add data hashing routines (pandas-dev#14729)
xref dask/dask#1807
1 parent c5f219a commit 06f26b5

File tree

5 files changed: +498 -2 lines


asv_bench/benchmarks/algorithms.py

+33
@@ -1,5 +1,6 @@
 import numpy as np
 import pandas as pd
+from pandas.util import testing as tm


 class algorithm(object):
@@ -55,3 +56,35 @@ def time_add_overflow_neg_arr(self):

     def time_add_overflow_mixed_arr(self):
         self.checked_add(self.arr, self.arrmixed)
+
+
+class hashing(object):
+    goal_time = 0.2
+
+    def setup(self):
+        N = 100000
+
+        self.df = pd.DataFrame(
+            {'A': pd.Series(tm.makeStringIndex(100).take(
+                np.random.randint(0, 100, size=N))),
+             'B': pd.Series(tm.makeStringIndex(10000).take(
+                 np.random.randint(0, 10000, size=N))),
+             'D': np.random.randn(N),
+             'E': np.arange(N),
+             'F': pd.date_range('20110101', freq='s', periods=N),
+             'G': pd.timedelta_range('1 day', freq='s', periods=N),
+             })
+        self.df['C'] = self.df['B'].astype('category')
+        self.df.iloc[10:20] = np.nan
+
+    def time_frame(self):
+        self.df.hash()
+
+    def time_series_int(self):
+        self.df.E.hash()
+
+    def time_series_string(self):
+        self.df.B.hash()
+
+    def time_series_categorical(self):
+        self.df.C.hash()

pandas/src/hash.pyx

+180
@@ -0,0 +1,180 @@
# cython: profile=False
# Translated from the reference implementation
# at https://github.com/veorq/SipHash

import cython
cimport numpy as cnp
import numpy as np
from numpy cimport ndarray, uint8_t, uint32_t, uint64_t

from cpython cimport (PyString_Check,
                      PyBytes_Check,
                      PyUnicode_Check)
from libc.stdlib cimport malloc, free

DEF cROUNDS = 2
DEF dROUNDS = 4


@cython.boundscheck(False)
def hash_object_array(ndarray[object] arr, object key, object encoding='utf8'):
    """
    Parameters
    ----------
    arr : 1-d object ndarray of objects
    key : hash key, must encode to exactly 16 bytes
    encoding : encoding for key & arr, default to 'utf8'

    Returns
    -------
    1-d uint64 ndarray of hashes

    """
    cdef:
        Py_ssize_t i, l, n
        ndarray[uint64_t] result
        bytes data, k
        uint8_t *kb, *lens
        char **vecs, *cdata
        object val

    k = <bytes>key.encode(encoding)
    kb = <uint8_t *>k
    if len(k) != 16:
        raise ValueError(
            'key should be a 16-byte string encoded, got {!r} (len {})'.format(
                k, len(k)))

    n = len(arr)

    # create an array of bytes
    vecs = <char **> malloc(n * sizeof(char *))
    lens = <uint8_t*> malloc(n * sizeof(uint8_t))

    cdef list datas = []
    for i in range(n):
        val = arr[i]
        if PyString_Check(val):
            data = <bytes>val.encode(encoding)
        elif PyBytes_Check(val):
            data = <bytes>val
        elif PyUnicode_Check(val):
            data = <bytes>val.encode(encoding)
        else:
            # non-strings
            data = <bytes>str(val).encode(encoding)

        l = len(data)
        lens[i] = l
        cdata = data

        # keep the reference alive through the end of the
        # function
        datas.append(data)
        vecs[i] = cdata

    result = np.empty(n, dtype=np.uint64)
    with nogil:
        for i in range(n):
            result[i] = low_level_siphash(<uint8_t *>vecs[i], lens[i], kb)

    free(vecs)
    free(lens)
    return result

cdef inline uint64_t _rotl(uint64_t x, uint64_t b) nogil:
    return (x << b) | (x >> (64 - b))

cdef inline void u32to8_le(uint8_t* p, uint32_t v) nogil:
    p[0] = <uint8_t>(v)
    p[1] = <uint8_t>(v >> 8)
    p[2] = <uint8_t>(v >> 16)
    p[3] = <uint8_t>(v >> 24)

cdef inline void u64to8_le(uint8_t* p, uint64_t v) nogil:
    u32to8_le(p, <uint32_t>v)
    u32to8_le(p + 4, <uint32_t>(v >> 32))

cdef inline uint64_t u8to64_le(uint8_t* p) nogil:
    return (<uint64_t>p[0] |
            <uint64_t>p[1] << 8 |
            <uint64_t>p[2] << 16 |
            <uint64_t>p[3] << 24 |
            <uint64_t>p[4] << 32 |
            <uint64_t>p[5] << 40 |
            <uint64_t>p[6] << 48 |
            <uint64_t>p[7] << 56)

cdef inline void _sipround(uint64_t* v0, uint64_t* v1,
                           uint64_t* v2, uint64_t* v3) nogil:
    v0[0] += v1[0]
    v1[0] = _rotl(v1[0], 13)
    v1[0] ^= v0[0]
    v0[0] = _rotl(v0[0], 32)
    v2[0] += v3[0]
    v3[0] = _rotl(v3[0], 16)
    v3[0] ^= v2[0]
    v0[0] += v3[0]
    v3[0] = _rotl(v3[0], 21)
    v3[0] ^= v0[0]
    v2[0] += v1[0]
    v1[0] = _rotl(v1[0], 17)
    v1[0] ^= v2[0]
    v2[0] = _rotl(v2[0], 32)

cpdef uint64_t siphash(bytes data, bytes key) except? 0:
    if len(key) != 16:
        raise ValueError(
            'key should be a 16-byte bytestring, got {!r} (len {})'.format(
                key, len(key)))
    return low_level_siphash(data, len(data), key)


@cython.cdivision(True)
cdef uint64_t low_level_siphash(uint8_t* data, size_t datalen,
                                uint8_t* key) nogil:
    cdef uint64_t v0 = 0x736f6d6570736575ULL
    cdef uint64_t v1 = 0x646f72616e646f6dULL
    cdef uint64_t v2 = 0x6c7967656e657261ULL
    cdef uint64_t v3 = 0x7465646279746573ULL
    cdef uint64_t b
    cdef uint64_t k0 = u8to64_le(key)
    cdef uint64_t k1 = u8to64_le(key + 8)
    cdef uint64_t m
    cdef int i
    cdef uint8_t* end = data + datalen - (datalen % sizeof(uint64_t))
    cdef int left = datalen & 7
    cdef int left_byte

    b = (<uint64_t>datalen) << 56
    v3 ^= k1
    v2 ^= k0
    v1 ^= k1
    v0 ^= k0

    while (data != end):
        m = u8to64_le(data)
        v3 ^= m
        for i in range(cROUNDS):
            _sipround(&v0, &v1, &v2, &v3)
        v0 ^= m

        data += sizeof(uint64_t)

    for i in range(left-1, -1, -1):
        b |= (<uint64_t>data[i]) << (i * 8)

    v3 ^= b

    for i in range(cROUNDS):
        _sipround(&v0, &v1, &v2, &v3)

    v0 ^= b
    v2 ^= 0xff

    for i in range(dROUNDS):
        _sipround(&v0, &v1, &v2, &v3)

    b = v0 ^ v1 ^ v2 ^ v3

    return b
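
A minimal sketch of exercising these compiled helpers directly (not part of the commit; it assumes pandas has been built from this branch so the extension module is importable as pandas._hash, which is the import used by pandas/tools/hashing.py below):

import numpy as np
from pandas import _hash   # compiled from pandas/src/hash.pyx (assumed built)

key = '0123456789123456'                # must encode to exactly 16 bytes
arr = np.array(['foo', b'bar', 42], dtype=object)

# each element is encoded to bytes and run through SipHash-2-4
hashes = _hash.hash_object_array(arr, key)           # 1-d uint64 ndarray

# the cpdef siphash wrapper hashes a single bytestring
h = _hash.siphash(b'some bytes', key.encode('utf8'))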

pandas/tools/hashing.py

+137
@@ -0,0 +1,137 @@
"""
data hash pandas / numpy objects
"""

import numpy as np
from pandas import _hash, Series, factorize, Categorical, Index
from pandas.lib import infer_dtype
from pandas.types.generic import ABCIndexClass, ABCSeries, ABCDataFrame
from pandas.types.common import is_categorical_dtype

# 16 byte long hashing key
_default_hash_key = '0123456789123456'


def hash_pandas_object(obj, index=True, encoding='utf8', hash_key=None):
    """
    Return a data hash of the Index/Series/DataFrame

    .. versionadded:: 0.19.2

    Parameters
    ----------
    index : boolean, default True
        include the index in the hash (if Series/DataFrame)
    encoding : string, default 'utf8'
        encoding for data & key when strings
    hash_key : string key to encode, default to _default_hash_key

    Returns
    -------
    Series of uint64, same length as the object

    """
    if hash_key is None:
        hash_key = _default_hash_key

    def adder(h, hashed_to_add):
        h = np.multiply(h, np.uint(3), h)
        return np.add(h, hashed_to_add, h)

    if isinstance(obj, ABCIndexClass):
        h = hash_array(obj.values, encoding, hash_key).astype('uint64')
        h = Series(h, index=obj, dtype='uint64')
    elif isinstance(obj, ABCSeries):
        h = hash_array(obj.values, encoding, hash_key).astype('uint64')
        if index:
            h = adder(h, hash_pandas_object(obj.index,
                                            index=False,
                                            encoding=encoding,
                                            hash_key=hash_key).values)
        h = Series(h, index=obj.index, dtype='uint64')
    elif isinstance(obj, ABCDataFrame):
        cols = obj.iteritems()
        first_series = next(cols)[1]
        h = hash_array(first_series.values, encoding,
                       hash_key).astype('uint64')
        for _, col in cols:
            h = adder(h, hash_array(col.values, encoding, hash_key))
        if index:
            h = adder(h, hash_pandas_object(obj.index,
                                            index=False,
                                            encoding=encoding,
                                            hash_key=hash_key).values)

        h = Series(h, index=obj.index, dtype='uint64')
    else:
        raise TypeError("Unexpected type for hashing %s" % type(obj))
    return h


def hash_array(vals, encoding='utf8', hash_key=None):
    """
    Given a 1d array, return an array of deterministic integers.

    .. versionadded:: 0.19.2

    Parameters
    ----------
    vals : ndarray
    encoding : string, default 'utf8'
        encoding for data & key when strings
    hash_key : string key to encode, default to _default_hash_key

    Returns
    -------
    1d uint64 numpy array of hash values, same length as the vals

    """

    # work with categoricals as ints.  (This check is above the complex
    # check so that we don't ask numpy if categorical is a subdtype of
    # complex, as it will choke.)
    if hash_key is None:
        hash_key = _default_hash_key

    if is_categorical_dtype(vals.dtype):
        vals = vals.codes

    # we'll be working with everything as 64-bit values, so handle this
    # 128-bit value early
    if np.issubdtype(vals.dtype, np.complex128):
        return hash_array(vals.real) + 23 * hash_array(vals.imag)

    # MAIN LOGIC:
    inferred = infer_dtype(vals)

    # First, turn whatever array this is into unsigned 64-bit ints, if we can
    # manage it.
    if inferred == 'boolean':
        vals = vals.astype('u8')

    if (np.issubdtype(vals.dtype, np.datetime64) or
            np.issubdtype(vals.dtype, np.timedelta64) or
            np.issubdtype(vals.dtype, np.number)) and vals.dtype.itemsize <= 8:

        vals = vals.view('u{}'.format(vals.dtype.itemsize)).astype('u8')
    else:

        # it's MUCH faster to categorize object dtypes, then hash and rename
        codes, categories = factorize(vals, sort=False)
        categories = Index(categories)
        c = Series(Categorical(codes, categories,
                               ordered=False, fastpath=True))
        vals = _hash.hash_object_array(categories.values,
                                       hash_key,
                                       encoding)

        # rename & extract
        vals = c.cat.rename_categories(Index(vals)).astype(np.uint64).values

    # Then, redistribute these 64-bit ints within the space of 64-bit ints
    vals ^= vals >> 30
    vals *= np.uint64(0xbf58476d1ce4e5b9)
    vals ^= vals >> 27
    vals *= np.uint64(0x94d049bb133111eb)
    vals ^= vals >> 31
    return vals
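
A short usage sketch of the new public routines (not part of the commit; the function names and module path are taken from the file above, and the example data is illustrative):

import numpy as np
import pandas as pd
from pandas.tools.hashing import hash_pandas_object, hash_array

df = pd.DataFrame({'a': [1, 2, 3], 'b': ['x', 'y', 'z']})

# one uint64 per row, mixing in every column and (by default) the index
h = hash_pandas_object(df)

# equal data hashes equally for a fixed hash_key, so this works as a
# cheap change/equality probe
assert h.equals(hash_pandas_object(df.copy()))

# hash_array operates on a bare 1-d ndarray and returns a uint64 ndarray
hv = hash_array(np.array(['x', 'y', 'z'], dtype=object))

That determinism for a fixed hash_key is presumably what the dask/dask#1807 cross-reference above is after.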
