Skip to content

Commit 0c13df7

Browse files
Mike Graham authored and jreback committed
Steal the algorithm used to combine hashes from tupleobject.c
1 parent e8dd607 commit 0c13df7

File tree

1 file changed

+13
-9
lines changed

1 file changed

+13
-9
lines changed

pandas/tools/hashing.py

+13 -9
Original file line number | Diff line number | Diff line change
@@ -16,14 +16,16 @@
1616

1717

1818
def _combine_hash_arrays(arrays, num_items):
19+
"Should be the same as CPython's tupleobject.c"
1920
first = next(arrays)
2021
arrays = itertools.chain([first], arrays)
2122

2223
mult = np.zeros_like(first) + np.uint64(1000003)
2324
out = np.zeros_like(first) + np.uint64(0x345678)
2425
for i, a in enumerate(arrays):
2526
inverse_i = num_items - i
26-
out = (out ^ a) * mult
27+
out ^= a
28+
out *= mult
2729
mult += np.uint64(82520 + inverse_i + inverse_i)
2830
assert i + 1 == num_items, 'Fed in wrong num_items'
2931
out += np.uint64(97531)
@@ -70,15 +72,17 @@ def hash_pandas_object(obj, index=True, encoding='utf8', hash_key=None,
7072
h = hash_array(obj.values, encoding, hash_key,
7173
categorize).astype('uint64', copy=False)
7274
if index:
73-
h = _combine_hash_arrays(iter([
74-
h,
75-
hash_pandas_object(obj.index,
76-
index=False,
77-
encoding=encoding,
78-
hash_key=hash_key,
79-
categorize=categorize).values]),
80-
2)
75+
index_iter = (hash_pandas_object(obj.index,
76+
index=False,
77+
encoding=encoding,
78+
hash_key=hash_key,
79+
categorize=categorize).values
80+
for _ in [None])
81+
arrays = itertools.chain([h], index_iter)
82+
h = _combine_hash_arrays(arrays, 2)
83+
8184
h = Series(h, index=obj.index, dtype='uint64', copy=False)
85+
8286
elif isinstance(obj, ABCDataFrame):
8387
hashes = (hash_array(series.values) for _, series in obj.iteritems())
8488
num_items = len(obj.columns)

0 commit comments

Comments
 (0)