Skip to content

Commit 187573b

Browse files
author
Mike Graham
committed
Steal the algorithm used to combine hashes from tupleobject.c
1 parent 5f40950 commit 187573b

File tree

1 file changed

+37
-21
lines changed

1 file changed

+37
-21
lines changed

pandas/tools/hashing.py

+37-21
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
"""
22
data hash pandas / numpy objects
33
"""
4+
import itertools
45

56
import numpy as np
67
from pandas import _hash, Series, factorize, Categorical, Index
@@ -13,6 +14,22 @@
1314
_default_hash_key = '0123456789123456'
1415

1516

17+
def _combine_hash_arrays(arrays, num_items):
18+
"Should be the same as CPython's tupleobject.c"
19+
first = next(arrays)
20+
arrays = itertools.chain([first], arrays)
21+
22+
mult = np.zeros_like(first) + np.uint64(1000003)
23+
out = np.zeros_like(first) + np.uint64(0x345678)
24+
for i, a in enumerate(arrays):
25+
inverse_i = num_items - i
26+
out ^= a
27+
out *= mult
28+
mult += np.uint64(82520 + inverse_i + inverse_i)
29+
assert i+1 == num_items, 'Fed in wrong num_items'
30+
out += np.uint64(97531)
31+
return out
32+
1633
def hash_pandas_object(obj, index=True, encoding='utf8', hash_key=None,
1734
categorize=True):
1835
"""
@@ -41,10 +58,6 @@ def hash_pandas_object(obj, index=True, encoding='utf8', hash_key=None,
4158
if hash_key is None:
4259
hash_key = _default_hash_key
4360

44-
def adder(h, hashed_to_add):
45-
h = np.multiply(h, np.uint(3), h)
46-
return np.add(h, hashed_to_add, h)
47-
4861
if isinstance(obj, ABCIndexClass):
4962
h = hash_array(obj.values, encoding, hash_key,
5063
categorize).astype('uint64')
@@ -53,26 +66,29 @@ def adder(h, hashed_to_add):
5366
h = hash_array(obj.values, encoding, hash_key,
5467
categorize).astype('uint64')
5568
if index:
56-
h = adder(h, hash_pandas_object(obj.index,
57-
index=False,
58-
encoding=encoding,
59-
hash_key=hash_key,
60-
categorize=categorize).values)
69+
index_iter = (hash_pandas_object(obj.index,
70+
index=False,
71+
encoding=encoding,
72+
hash_key=hash_key,
73+
categorize=categorize).values
74+
for _ in [None])
75+
arrays = itertools.chain([h], index_iter)
76+
h = _combine_hash_arrays(arrays, 2)
77+
6178
h = Series(h, index=obj.index, dtype='uint64')
6279
elif isinstance(obj, ABCDataFrame):
63-
cols = obj.iteritems()
64-
first_series = next(cols)[1]
65-
h = hash_array(first_series.values, encoding,
66-
hash_key, categorize).astype('uint64')
67-
for _, col in cols:
68-
h = adder(h, hash_array(col.values, encoding, hash_key,
69-
categorize))
80+
hashes = (hash_array(series.values) for _, series in obj.iteritems())
81+
num_items = len(obj.columns)
7082
if index:
71-
h = adder(h, hash_pandas_object(obj.index,
72-
index=False,
73-
encoding=encoding,
74-
hash_key=hash_key,
75-
categorize=categorize).values)
83+
index_hash_generator = (hash_pandas_object(obj.index,
84+
index=False,
85+
encoding=encoding,
86+
hash_key=hash_key,
87+
categorize=categorize).values
88+
for _ in [None])
89+
num_items += 1
90+
hashes = itertools.chain(hashes, index_hash_generator)
91+
h = _combine_hash_arrays(hashes, num_items)
7692

7793
h = Series(h, index=obj.index, dtype='uint64')
7894
else:

0 commit comments

Comments
 (0)