Skip to content

Commit 00eda16

Browse files
Mike Graham authored and jreback committed
wip: Steal the algorithm used to combine hashes from tupleobject.c
1 parent d0568d9 commit 00eda16

File tree

2 files changed

+39
-25
lines changed

2 files changed

+39
-25
lines changed

pandas/tools/hashing.py

+37-24
Original file line numberDiff line numberDiff line change
@@ -1,19 +1,34 @@
11
"""
22
data hash pandas / numpy objects
33
"""
4+
import itertools
45

56
import numpy as np
67
from pandas import _hash, Series, factorize, Categorical, Index, MultiIndex
78
from pandas.lib import is_bool_array
89
from pandas.types.generic import ABCIndexClass, ABCSeries, ABCDataFrame
910
from pandas.types.common import (is_categorical_dtype, is_numeric_dtype,
10-
is_datetime64_dtype, is_timedelta64_dtype,
11-
is_object_dtype)
11+
is_datetime64_dtype, is_timedelta64_dtype)
1212

1313
# 16 byte long hashing key
1414
_default_hash_key = '0123456789123456'
1515

1616

17+
def _combine_hash_arrays(arrays, num_items):
    """
    Combine several equal-length hash arrays into one, element-wise.

    The mixing scheme (seed 0x345678, multiplier starting at 1000003 and
    growing by ``82520 + 2 * remaining`` per item, final offset 97531) is
    the one borrowed from the tuple hash in CPython's tupleobject.c.

    Parameters
    ----------
    arrays : iterable of ndarray
        Hash arrays to fold together, in order.  May be a lazy
        iterator/generator; each array is consumed exactly once.
        NOTE(review): callers in this module pass uint64 arrays; other
        dtypes are not guarded against here.
    num_items : int
        Number of arrays that ``arrays`` will yield.  It must be known up
        front because the multiplier update depends on how many items
        remain.

    Returns
    -------
    ndarray
        The combined hashes, shaped like the first input array.  An empty
        uint64 array is returned when ``arrays`` yields nothing and
        ``num_items`` is 0.

    Raises
    ------
    AssertionError
        If the number of arrays actually consumed differs from
        ``num_items``.
    """
    arrays = iter(arrays)
    try:
        first = next(arrays)
    except StopIteration:
        # Nothing to combine: do not leak StopIteration to the caller.
        if num_items != 0:
            raise AssertionError('Fed in wrong num_items')
        return np.array([], dtype=np.uint64)

    # Put the peeked element back so the loop below sees every array.
    arrays = itertools.chain([first], arrays)

    # The multiplier is identical for every element, so a scalar suffices.
    mult = np.uint64(1000003)
    out = np.zeros_like(first) + np.uint64(0x345678)

    seen = 0
    for seen, a in enumerate(arrays, start=1):
        inverse_i = num_items - (seen - 1)
        out = (out ^ a) * mult
        mult += np.uint64(82520 + inverse_i + inverse_i)

    # Raised explicitly (not via an ``assert`` statement) so the check
    # still runs under ``python -O``.
    if seen != num_items:
        raise AssertionError('Fed in wrong num_items')

    out += np.uint64(97531)
    return out
30+
31+
1732
def hash_pandas_object(obj, index=True, encoding='utf8', hash_key=None,
1833
categorize=True):
1934
"""
@@ -42,10 +57,6 @@ def hash_pandas_object(obj, index=True, encoding='utf8', hash_key=None,
4257
if hash_key is None:
4358
hash_key = _default_hash_key
4459

45-
def adder(h, hashed_to_add):
46-
h = np.multiply(h, np.uint(3), h)
47-
return np.add(h, hashed_to_add, h)
48-
4960
if isinstance(obj, MultiIndex):
5061
return _hash_tuples(obj, encoding, hash_key)
5162

@@ -57,26 +68,28 @@ def adder(h, hashed_to_add):
5768
h = hash_array(obj.values, encoding, hash_key,
5869
categorize).astype('uint64')
5970
if index:
60-
h = adder(h, hash_pandas_object(obj.index,
61-
index=False,
62-
encoding=encoding,
63-
hash_key=hash_key,
64-
categorize=categorize).values)
71+
h = _combine_hash_arrays(iter([
72+
h,
73+
hash_pandas_object(obj.index,
74+
index=False,
75+
encoding=encoding,
76+
hash_key=hash_key,
77+
categorize=categorize).values]),
78+
2)
6579
h = Series(h, index=obj.index, dtype='uint64')
6680
elif isinstance(obj, ABCDataFrame):
67-
cols = obj.iteritems()
68-
first_series = next(cols)[1]
69-
h = hash_array(first_series.values, encoding,
70-
hash_key, categorize).astype('uint64')
71-
for _, col in cols:
72-
h = adder(h, hash_array(col.values, encoding, hash_key,
73-
categorize))
81+
hashes = (hash_array(series.values) for _, series in obj.iteritems())
82+
num_items = len(obj.columns)
7483
if index:
75-
h = adder(h, hash_pandas_object(obj.index,
76-
index=False,
77-
encoding=encoding,
78-
hash_key=hash_key,
79-
categorize=categorize).values)
84+
index_hash_generator = (hash_pandas_object(obj.index,
85+
index=False,
86+
encoding=encoding,
87+
hash_key=hash_key,
88+
categorize=categorize).values # noqa
89+
for _ in [None])
90+
num_items += 1
91+
hashes = itertools.chain(hashes, index_hash_generator)
92+
h = _combine_hash_arrays(hashes, num_items)
8093

8194
h = Series(h, index=obj.index, dtype='uint64')
8295
else:
@@ -103,7 +116,7 @@ def _hash_tuples(vals, encoding, hash_key):
103116
vals = MultiIndex.from_tuples(vals)
104117

105118
# efficiently turn us into a DataFrame and hash
106-
return hash_pandas_object(vals.to_dataframe(index=False),
119+
return hash_pandas_object(vals.to_frame(index=False),
107120
index=False, encoding=encoding,
108121
hash_key=hash_key, categorize=False)
109122

pandas/tools/tests/test_hashing.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -62,7 +62,8 @@ def test_hash_list_tuples(self):
6262
self.assert_numpy_array_equal(result, expected)
6363

6464
def test_multiindex_unique(self):
65-
mi = MultiIndex.from_tuples([(118, 472), (236, 118), (51, 204), (102, 51)])
65+
mi = MultiIndex.from_tuples([(118, 472), (236, 118),
66+
(51, 204), (102, 51)])
6667
self.assertTrue(mi.is_unique)
6768
result = hash_pandas_object(mi)
6869
self.assertTrue(result.is_unique)

0 commit comments

Comments
 (0)