Skip to content

Commit 522c384

Browse files
pitrou authored and mrocklin committed
Faster array tokenize() (dask#2377)
* Faster array tokenize()

  Optionally uses fast hashing functions if available (the packages 'mmh3' or 'cityhash' need to be installed), falls back on SHA1.
* Appease flake8
* Disable cityhash
* Reenable cityhash if post-0.2.1
* Also catch OverflowError (for mmh3)
* Add xxHash
1 parent 381a2f9 commit 522c384

File tree

5 files changed

+158
-7
lines changed

5 files changed

+158
-7
lines changed

continuous_integration/travis/install.sh

+4-1
Original file line numberDiff line numberDiff line change
@@ -79,9 +79,12 @@ pip install -q \
7979
--upgrade --no-deps
8080

8181
pip install -q \
82+
cityhash \
8283
flake8 \
84+
mmh3 \
8385
pandas_datareader \
84-
pytest-xdist
86+
pytest-xdist \
87+
xxhash
8588

8689
# Install dask
8790
pip install -q --no-deps -e .[complete]

dask/base.py

+6-5
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
from .compatibility import bind_method, unicode, PY3
1717
from .context import _globals
1818
from .core import flatten
19+
from .hashing import hash_buffer_hex
1920
from .utils import Dispatch, ensure_dict
2021
from .sharedict import ShareDict
2122

@@ -387,15 +388,15 @@ def normalize_array(x):
387388
x.shape, x.strides, offset)
388389
if x.dtype.hasobject:
389390
try:
390-
data = md5('-'.join(x.flat).encode('utf-8')).hexdigest()
391+
data = hash_buffer_hex('-'.join(x.flat).encode('utf-8'))
391392
except TypeError:
392-
data = md5(b'-'.join([unicode(item).encode('utf-8') for item in
393-
x.flat])).hexdigest()
393+
data = hash_buffer_hex(b'-'.join([unicode(item).encode('utf-8') for item in
394+
x.flat]))
394395
else:
395396
try:
396-
data = md5(x.ravel().view('i1').data).hexdigest()
397+
data = hash_buffer_hex(x.ravel(order='K').view('i1'))
397398
except (BufferError, AttributeError, ValueError):
398-
data = md5(x.copy().ravel().view('i1').data).hexdigest()
399+
data = hash_buffer_hex(x.copy().ravel(order='K').view('i1'))
399400
return (data, x.dtype, x.shape, x.strides)
400401

401402
normalize_token.register(np.dtype, repr)

dask/hashing.py

+102
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,102 @@
1+
from __future__ import absolute_import, division, print_function
2+
3+
import binascii
4+
import hashlib
5+
import sys
6+
7+
8+
hashers = [] # In decreasing performance order
9+
10+
11+
# Timings on a largish array:
12+
# - CityHash is 2x faster than MurmurHash
13+
# - xxHash is slightly slower than CityHash
14+
# - MurmurHash is 8x faster than SHA1
15+
# - SHA1 is significantly faster than all other hashlib algorithms
16+
17+
try:
18+
import cityhash # `pip install cityhash`
19+
except ImportError:
20+
pass
21+
else:
22+
# CityHash disabled unless the reference leak in
23+
# https://github.com/escherba/python-cityhash/pull/16
24+
# is fixed.
25+
if cityhash.__version__ >= '0.2.2':
26+
def _hash_cityhash(buf):
27+
"""
28+
Produce a 16-bytes hash of *buf* using CityHash.
29+
"""
30+
h = cityhash.CityHash128(buf)
31+
if sys.version_info >= (3,):
32+
return h.to_bytes(16, 'little')
33+
else:
34+
return binascii.a2b_hex('%032x' % h)
35+
36+
hashers.append(_hash_cityhash)
37+
38+
try:
39+
import xxhash # `pip install xxhash`
40+
except ImportError:
41+
pass
42+
else:
43+
def _hash_xxhash(buf):
44+
"""
45+
Produce a 8-bytes hash of *buf* using xxHash.
46+
"""
47+
return xxhash.xxh64(buf).digest()
48+
49+
hashers.append(_hash_xxhash)
50+
51+
try:
52+
import mmh3 # `pip install mmh3`
53+
except ImportError:
54+
pass
55+
else:
56+
def _hash_murmurhash(buf):
57+
"""
58+
Produce a 16-bytes hash of *buf* using MurmurHash.
59+
"""
60+
return mmh3.hash_bytes(buf)
61+
62+
hashers.append(_hash_murmurhash)
63+
64+
65+
def _hash_sha1(buf):
    """
    Return the 20-byte SHA1 digest of *buf*, computed with hashlib.
    """
    state = hashlib.sha1(buf)
    return state.digest()
70+
71+
72+
hashers.append(_hash_sha1)
73+
74+
75+
def hash_buffer(buf, hasher=None):
    """
    Hash a bytes-like (buffer-compatible) object.  This function returns
    a good quality hash but is not cryptographically secure.  The fastest
    available algorithm is selected.  A bytes object is returned; its
    length depends on which hasher produced it.

    Parameters
    ----------
    buf : bytes-like
        The object to hash.
    hasher : callable, optional
        Hash function to try first.  If it rejects *buf* with a
        TypeError or OverflowError, the functions registered in
        ``hashers`` are tried in order instead.

    Raises
    ------
    TypeError
        If no hasher accepts *buf*.
    """
    if hasher is not None:
        try:
            return hasher(buf)
        except (TypeError, OverflowError):
            # Some hash libraries may have overly-strict type checking,
            # not accepting all buffers
            pass
    for candidate in hashers:
        if candidate is hasher:
            # Already tried above and it rejected this buffer
            continue
        try:
            return candidate(buf)
        except (TypeError, OverflowError):
            pass
    raise TypeError("unsupported type for hashing: %s" % (type(buf),))
94+
95+
96+
def hash_buffer_hex(buf, hasher=None):
    """
    Hash *buf* exactly like hash_buffer does, but return the digest
    hex-encoded as a native ``str``.
    """
    hexed = binascii.b2a_hex(hash_buffer(buf, hasher))
    if sys.version_info >= (3,):
        # b2a_hex yields bytes; convert to text on Python 3
        return hexed.decode()
    return hexed

dask/tests/test_delayed.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -403,7 +403,7 @@ def __call__(self):
403403
assert f().compute() == 2
404404

405405

406-
def test_name_consitent_across_instances():
406+
def test_name_consistent_across_instances():
407407
func = delayed(identity, pure=True)
408408

409409
data = {'x': 1, 'y': 25, 'z': [1, 2, 3]}

dask/tests/test_hashing.py

+45
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
from __future__ import absolute_import, division, print_function
2+
3+
import pytest
4+
5+
from dask.hashing import hashers, hash_buffer, hash_buffer_hex
6+
7+
8+
np = pytest.importorskip('numpy')
9+
10+
buffers = [
11+
b'abc',
12+
bytearray(b'123'),
13+
memoryview(b'456'),
14+
np.array(42),
15+
np.ones((100, 100)),
16+
np.zeros((100, 100), dtype=[('a', 'i4'), ('b', 'i2')]),
17+
np.ones(10000, dtype=np.int8)[1:], # unaligned
18+
]
19+
20+
21+
@pytest.mark.parametrize('x', buffers)
def test_hash_buffer(x):
    """
    hash_buffer returns a bytes digest of plausible length and is
    repeatable, both with automatic selection and with each hasher.
    """
    candidates = [None]
    candidates.extend(hashers)
    for candidate in candidates:
        digest = hash_buffer(x, hasher=candidate)
        assert isinstance(digest, bytes)
        assert len(digest) >= 8 and len(digest) < 32
        # Hashing the same input again must give the same result
        again = hash_buffer(x, hasher=candidate)
        assert digest == again
28+
29+
30+
@pytest.mark.parametrize('x', buffers)
def test_hash_buffer_hex(x):
    """
    hash_buffer_hex returns a str of plausible length and is
    repeatable, both with automatic selection and with each hasher.
    """
    candidates = [None]
    candidates.extend(hashers)
    for candidate in candidates:
        hexdigest = hash_buffer_hex(x, hasher=candidate)
        assert isinstance(hexdigest, str)
        assert len(hexdigest) >= 16 and len(hexdigest) < 64
        # Hashing the same input again must give the same result
        again = hash_buffer_hex(x, hasher=candidate)
        assert hexdigest == again
37+
38+
39+
@pytest.mark.parametrize('hasher', hashers)
def test_hashers(hasher):
    """
    Sanity check: each registered hasher, called directly on a short
    bytes input, returns bytes of plausible length.
    """
    digest = hasher(b'x')
    assert isinstance(digest, bytes)
    assert len(digest) >= 8 and len(digest) < 32

0 commit comments

Comments
 (0)