Skip to content

Commit 7117b6b

Browse files
author
Mike Graham
committed
Steal the algorithm used to combine hashes from tupleobject.c
1 parent 5f40950 commit 7117b6b

File tree

1 file changed

+34
-21
lines changed

1 file changed

+34
-21
lines changed

pandas/tools/hashing.py

+34-21
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
"""
22
data hash pandas / numpy objects
33
"""
4+
import itertools
45

56
import numpy as np
67
from pandas import _hash, Series, factorize, Categorical, Index
@@ -13,6 +14,20 @@
1314
_default_hash_key = '0123456789123456'
1415

1516

17+
def _combine_hash_arrays(arrays, num_items):
    """
    Combine several arrays of per-row hashes into one array of hashes.

    The mixing scheme is modelled on the tuple hash in CPython's
    tupleobject.c: start from a fixed seed, then for every input array
    xor it into the running value and multiply by a multiplier that
    changes from item to item.  The operations are element-wise, so
    position ``k`` of the result depends only on position ``k`` of each
    input array.

    Parameters
    ----------
    arrays : iterable of ndarray
        The hash arrays to combine, consumed exactly once.  Iterators
        and generators are accepted, as are plain sequences.
        # NOTE(review): callers visible here pass uint64 arrays of equal
        # length; other dtypes/shapes are not exercised - confirm.
    num_items : int
        The number of arrays that ``arrays`` will yield.  It feeds the
        per-item multiplier, so it has to be known before iterating.

    Returns
    -------
    ndarray
        The combined hashes, shaped like the first input array.  When
        ``arrays`` yields nothing (and ``num_items`` is 0) an empty
        ``uint64`` array is returned, because there is no input to take
        a shape from.

    Raises
    ------
    AssertionError
        If the number of arrays actually yielded differs from
        ``num_items``.
    """
    arrays = iter(arrays)
    try:
        first = next(arrays)
    except StopIteration:
        # Nothing to combine.  Without this guard the StopIteration
        # would leak out of the function.
        if num_items != 0:
            raise AssertionError('Fed in wrong num_items')
        return np.array([], dtype=np.uint64)

    # Put the peeked element back so the loop below sees every array.
    arrays = itertools.chain([first], arrays)

    mult = np.zeros_like(first) + np.uint64(1000003)
    out = np.zeros_like(first) + np.uint64(0x345678)
    for i, a in enumerate(arrays):
        # Number of items not yet fully processed, counting this one.
        inverse_i = num_items - i
        out = (out ^ a) * mult
        mult += np.uint64(82520 + inverse_i + inverse_i)

    # Raised explicitly rather than via ``assert`` so the check is not
    # stripped when Python runs with -O; the exception type is unchanged.
    if i + 1 != num_items:
        raise AssertionError('Fed in wrong num_items')

    out += np.uint64(97531)
    return out
30+
1631
def hash_pandas_object(obj, index=True, encoding='utf8', hash_key=None,
1732
categorize=True):
1833
"""
@@ -41,10 +56,6 @@ def hash_pandas_object(obj, index=True, encoding='utf8', hash_key=None,
4156
if hash_key is None:
4257
hash_key = _default_hash_key
4358

44-
def adder(h, hashed_to_add):
45-
h = np.multiply(h, np.uint(3), h)
46-
return np.add(h, hashed_to_add, h)
47-
4859
if isinstance(obj, ABCIndexClass):
4960
h = hash_array(obj.values, encoding, hash_key,
5061
categorize).astype('uint64')
@@ -53,26 +64,28 @@ def adder(h, hashed_to_add):
5364
h = hash_array(obj.values, encoding, hash_key,
5465
categorize).astype('uint64')
5566
if index:
56-
h = adder(h, hash_pandas_object(obj.index,
57-
index=False,
58-
encoding=encoding,
59-
hash_key=hash_key,
60-
categorize=categorize).values)
67+
h = _combine_hash_arrays(iter([
68+
h,
69+
hash_pandas_object(obj.index,
70+
index=False,
71+
encoding=encoding,
72+
hash_key=hash_key,
73+
categorize=categorize).values]),
74+
2)
6175
h = Series(h, index=obj.index, dtype='uint64')
6276
elif isinstance(obj, ABCDataFrame):
63-
cols = obj.iteritems()
64-
first_series = next(cols)[1]
65-
h = hash_array(first_series.values, encoding,
66-
hash_key, categorize).astype('uint64')
67-
for _, col in cols:
68-
h = adder(h, hash_array(col.values, encoding, hash_key,
69-
categorize))
77+
hashes = (hash_array(series.values) for _, series in obj.iteritems())
78+
num_items = len(obj.columns)
7079
if index:
71-
h = adder(h, hash_pandas_object(obj.index,
72-
index=False,
73-
encoding=encoding,
74-
hash_key=hash_key,
75-
categorize=categorize).values)
80+
index_hash_generator = (hash_pandas_object(obj.index,
81+
index=False,
82+
encoding=encoding,
83+
hash_key=hash_key,
84+
categorize=categorize).values
85+
for _ in [None])
86+
num_items += 1
87+
hashes = itertools.chain(hashes, index_hash_generator)
88+
h = _combine_hash_arrays(hashes, num_items)
7689

7790
h = Series(h, index=obj.index, dtype='uint64')
7891
else:

0 commit comments

Comments
 (0)