Skip to content

Commit c67486f

Browse files
committed
ENH: support MultiIndex and tuple hashing
closes #15227 Author: Jeff Reback <[email protected]> Author: Mike Graham <mikegraham2gmail.com> Closes #15224 from jreback/mi_hash2 and squashes the following commits: 8b1d3f9 [Jeff Reback] not correctly hashing categorical in a MI 48a2402 [Jeff Reback] support for mixed type arrays 58f682d [Jeff Reback] memory optimization 0c13df7 [Mike Graham] Steal the algorithm used to combine hashes from tupleobject.c e8dd607 [Jeff Reback] add hash_tuples 44e9c7d [Mike Graham] wipSteal the algorithm used to combine hashes from tupleobject.c e507c4a [Jeff Reback] ENH: support MultiIndex and tuple hashing
1 parent be32852 commit c67486f

File tree

3 files changed

+178
-55
lines changed

3 files changed

+178
-55
lines changed

doc/source/whatsnew/v0.20.0.txt

+1
Original file line numberDiff line numberDiff line change
@@ -145,6 +145,7 @@ Other enhancements
145145
- ``pd.merge_asof()`` gained the option ``direction='backward'|'forward'|'nearest'`` (:issue:`14887`)
146146
- ``Series/DataFrame.asfreq()`` have gained a ``fill_value`` parameter, to fill missing values (:issue:`3715`).
147147
- ``Series/DataFrame.resample.asfreq`` have gained a ``fill_value`` parameter, to fill missing values during resampling (:issue:`3715`).
148+
- ``pandas.tools.hashing`` has gained a ``hash_tuples`` routine, and ``hash_pandas_object`` has gained the ability to hash a ``MultiIndex`` (:issue:`15224`)
148149

149150
.. _ISO 8601 duration: https://en.wikipedia.org/wiki/ISO_8601#Durations
150151

pandas/tools/hashing.py

+125-33
Original file line numberDiff line numberDiff line change
@@ -1,18 +1,49 @@
11
"""
22
utilities for hashing pandas / numpy objects
33
"""
4+
import itertools
45

56
import numpy as np
6-
from pandas import _hash, Series, factorize, Categorical, Index
7+
from pandas import _hash, Series, factorize, Categorical, Index, MultiIndex
8+
import pandas.core.algorithms as algos
79
from pandas.lib import is_bool_array
810
from pandas.types.generic import ABCIndexClass, ABCSeries, ABCDataFrame
911
from pandas.types.common import (is_categorical_dtype, is_numeric_dtype,
10-
is_datetime64_dtype, is_timedelta64_dtype)
12+
is_datetime64_dtype, is_timedelta64_dtype,
13+
is_list_like)
1114

1215
# 16 byte long hashing key
1316
_default_hash_key = '0123456789123456'
1417

1518

19+
def _combine_hash_arrays(arrays, num_items):
    """
    Combine several equal-length hash arrays into one array of hashes.

    The arrays are folded together element-wise, in order, using the same
    mixing scheme as CPython's ``tupleobject.c``: element ``k`` of the
    result is computed from element ``k`` of every input array.

    Parameters
    ----------
    arrays : iterable of array-like
        Hash arrays to combine; iterated exactly once. Each one is viewed
        as ``uint64`` before being mixed in.
    num_items : int
        Number of arrays that ``arrays`` yields.

    Returns
    -------
    ndarray of uint64
        Combined hashes, shaped like the first array. An empty array is
        returned when ``arrays`` yields nothing.

    Raises
    ------
    AssertionError
        If ``arrays`` does not yield exactly ``num_items`` arrays.
    """
    # accept any iterable (list, tuple, generator), not only iterators
    arrays = iter(arrays)
    try:
        first = next(arrays)
    except StopIteration:
        return np.array([], dtype=np.uint64)

    mult = np.uint64(1000003)
    # allocate the accumulator directly as uint64 so the in-place integer
    # operations below never depend on the dtype of the inputs
    out = np.full(np.shape(first), 0x345678, dtype=np.uint64)

    seen = 0
    for i, a in enumerate(itertools.chain([first], arrays)):
        inverse_i = num_items - i
        out ^= np.asarray(a, dtype=np.uint64)
        out *= mult
        mult += np.uint64(82520 + inverse_i + inverse_i)
        seen = i + 1

    # Raised explicitly instead of via ``assert`` so the check is not
    # stripped under ``python -O``; the exception type is kept as-is for
    # backward compatibility.
    if seen != num_items:
        raise AssertionError('Fed in wrong num_items')

    out += np.uint64(97531)
    return out
45+
46+
1647
def hash_pandas_object(obj, index=True, encoding='utf8', hash_key=None,
1748
categorize=True):
1849
"""
@@ -41,45 +72,97 @@ def hash_pandas_object(obj, index=True, encoding='utf8', hash_key=None,
4172
if hash_key is None:
4273
hash_key = _default_hash_key
4374

44-
def adder(h, hashed_to_add):
45-
h = np.multiply(h, np.uint(3), h)
46-
return np.add(h, hashed_to_add, h)
75+
if isinstance(obj, MultiIndex):
76+
return Series(hash_tuples(obj, encoding, hash_key),
77+
dtype='uint64', copy=False)
4778

4879
if isinstance(obj, ABCIndexClass):
4980
h = hash_array(obj.values, encoding, hash_key,
50-
categorize).astype('uint64')
51-
h = Series(h, index=obj, dtype='uint64')
81+
categorize).astype('uint64', copy=False)
82+
h = Series(h, index=obj, dtype='uint64', copy=False)
5283
elif isinstance(obj, ABCSeries):
5384
h = hash_array(obj.values, encoding, hash_key,
54-
categorize).astype('uint64')
85+
categorize).astype('uint64', copy=False)
5586
if index:
56-
h = adder(h, hash_pandas_object(obj.index,
57-
index=False,
58-
encoding=encoding,
59-
hash_key=hash_key,
60-
categorize=categorize).values)
61-
h = Series(h, index=obj.index, dtype='uint64')
87+
index_iter = (hash_pandas_object(obj.index,
88+
index=False,
89+
encoding=encoding,
90+
hash_key=hash_key,
91+
categorize=categorize).values
92+
for _ in [None])
93+
arrays = itertools.chain([h], index_iter)
94+
h = _combine_hash_arrays(arrays, 2)
95+
96+
h = Series(h, index=obj.index, dtype='uint64', copy=False)
97+
6298
elif isinstance(obj, ABCDataFrame):
63-
cols = obj.iteritems()
64-
first_series = next(cols)[1]
65-
h = hash_array(first_series.values, encoding,
66-
hash_key, categorize).astype('uint64')
67-
for _, col in cols:
68-
h = adder(h, hash_array(col.values, encoding, hash_key,
69-
categorize))
99+
hashes = (hash_array(series.values) for _, series in obj.iteritems())
100+
num_items = len(obj.columns)
70101
if index:
71-
h = adder(h, hash_pandas_object(obj.index,
72-
index=False,
73-
encoding=encoding,
74-
hash_key=hash_key,
75-
categorize=categorize).values)
102+
index_hash_generator = (hash_pandas_object(obj.index,
103+
index=False,
104+
encoding=encoding,
105+
hash_key=hash_key,
106+
categorize=categorize).values # noqa
107+
for _ in [None])
108+
num_items += 1
109+
hashes = itertools.chain(hashes, index_hash_generator)
110+
h = _combine_hash_arrays(hashes, num_items)
76111

77-
h = Series(h, index=obj.index, dtype='uint64')
112+
h = Series(h, index=obj.index, dtype='uint64', copy=False)
78113
else:
79114
raise TypeError("Unexpected type for hashing %s" % type(obj))
80115
return h
81116

82117

118+
def hash_tuples(vals, encoding='utf8', hash_key=None):
    """
    Hash a MultiIndex / list-of-tuples efficiently

    .. versionadded:: 0.20.0

    Parameters
    ----------
    vals : MultiIndex, list-of-tuples, or single tuple
    encoding : string, default 'utf8'
    hash_key : string key to encode, default to _default_hash_key

    Returns
    -------
    ndarray of hashed values, one per tuple; when a single tuple is
    passed, the first (only) element of that array is returned instead
    """
    single = isinstance(vals, tuple)
    if single:
        # wrap so the lone tuple goes through the list-of-tuples path
        vals = [vals]
    elif not is_list_like(vals):
        raise TypeError("must be convertible to a list-of-tuples")

    if isinstance(vals, MultiIndex):
        mi = vals
    else:
        mi = MultiIndex.from_tuples(vals)

    # expand every level into a full-length array by taking the level's
    # values at that level's labels; fills use the level's own NA value
    level_arrays = [
        algos.take_1d(level._values, labels, fill_value=level._na_value)
        for level, labels in zip(mi.levels, mi.labels)
    ]

    # hash each level lazily, then fold the per-level hashes together
    level_hashes = (hash_array(arr, encoding=encoding, hash_key=hash_key)
                    for arr in level_arrays)
    combined = _combine_hash_arrays(level_hashes, len(level_arrays))

    return combined[0] if single else combined
164+
165+
83166
def _hash_categorical(c, encoding, hash_key):
84167
"""
85168
Hash a Categorical by hashing its categories, and then mapping the codes
@@ -97,7 +180,7 @@ def _hash_categorical(c, encoding, hash_key):
97180
"""
98181
cat_hashed = hash_array(c.categories.values, encoding, hash_key,
99182
categorize=False).astype(np.uint64, copy=False)
100-
return c.rename_categories(cat_hashed).astype(np.uint64)
183+
return c.rename_categories(cat_hashed).astype(np.uint64, copy=False)
101184

102185

103186
def hash_array(vals, encoding='utf8', hash_key=None, categorize=True):
@@ -108,7 +191,7 @@ def hash_array(vals, encoding='utf8', hash_key=None, categorize=True):
108191
109192
Parameters
110193
----------
111-
vals : ndarray
194+
vals : ndarray, Categorical
112195
encoding : string, default 'utf8'
113196
encoding for data & key when strings
114197
hash_key : string key to encode, default to _default_hash_key
@@ -124,6 +207,9 @@ def hash_array(vals, encoding='utf8', hash_key=None, categorize=True):
124207
125208
"""
126209

210+
if not hasattr(vals, 'dtype'):
211+
raise TypeError("must pass a ndarray-like")
212+
127213
if hash_key is None:
128214
hash_key = _default_hash_key
129215

@@ -142,9 +228,10 @@ def hash_array(vals, encoding='utf8', hash_key=None, categorize=True):
142228
# manage it.
143229
if is_bool_array(vals):
144230
vals = vals.astype('u8')
145-
elif ((is_datetime64_dtype(vals) or
146-
is_timedelta64_dtype(vals) or
147-
is_numeric_dtype(vals)) and vals.dtype.itemsize <= 8):
231+
elif (is_datetime64_dtype(vals) or
232+
is_timedelta64_dtype(vals)):
233+
vals = vals.view('i8').astype('u8', copy=False)
234+
elif (is_numeric_dtype(vals) and vals.dtype.itemsize <= 8):
148235
vals = vals.view('u{}'.format(vals.dtype.itemsize)).astype('u8')
149236
else:
150237
# With repeated values, it's MUCH faster to categorize object dtypes,
@@ -156,7 +243,12 @@ def hash_array(vals, encoding='utf8', hash_key=None, categorize=True):
156243
ordered=False, fastpath=True)
157244
return _hash_categorical(cat, encoding, hash_key)
158245

159-
vals = _hash.hash_object_array(vals, hash_key, encoding)
246+
try:
247+
vals = _hash.hash_object_array(vals, hash_key, encoding)
248+
except TypeError:
249+
# we have mixed types
250+
vals = _hash.hash_object_array(vals.astype(str).astype(object),
251+
hash_key, encoding)
160252

161253
# Then, redistribute these 64-bit ints within the space of 64-bit ints
162254
vals ^= vals >> 30

pandas/tools/tests/test_hashing.py

+52-22
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
11
import numpy as np
22
import pandas as pd
33

4-
from pandas import DataFrame, Series, Index
5-
from pandas.tools.hashing import hash_array, hash_pandas_object
4+
from pandas import DataFrame, Series, Index, MultiIndex
5+
from pandas.tools.hashing import hash_array, hash_tuples, hash_pandas_object
66
import pandas.util.testing as tm
77

88

@@ -36,6 +36,18 @@ def test_hash_array(self):
3636
a = s.values
3737
tm.assert_numpy_array_equal(hash_array(a), hash_array(a))
3838

39+
def test_hash_array_mixed(self):
    """Mixed int/str input must hash like its all-string equivalent."""
    from_mixed = hash_array(np.array([3, 4, 'All']))
    from_strings = hash_array(np.array(['3', '4', 'All']))
    from_objects = hash_array(np.array([3, 4, 'All'], dtype=object))

    for other in (from_strings, from_objects):
        tm.assert_numpy_array_equal(from_mixed, other)
45+
46+
def test_hash_array_errors(self):
    """hash_array must raise TypeError for each of these scalar inputs."""
    scalars = (5, 'foo', pd.Timestamp('20130101'))
    for scalar in scalars:
        self.assertRaises(TypeError, hash_array, scalar)
50+
3951
def check_equal(self, obj, **kwargs):
4052
a = hash_pandas_object(obj, **kwargs)
4153
b = hash_pandas_object(obj, **kwargs)
@@ -53,7 +65,29 @@ def check_not_equal_with_index(self, obj):
5365
if not isinstance(obj, Index):
5466
a = hash_pandas_object(obj, index=True)
5567
b = hash_pandas_object(obj, index=False)
56-
self.assertFalse((a == b).all())
68+
if len(obj):
69+
self.assertFalse((a == b).all())
70+
71+
def test_hash_tuples(self):
    """hash_tuples must agree with hashing the equivalent MultiIndex."""
    tups = [(1, 'one'), (1, 'two'), (2, 'one')]
    expected = hash_pandas_object(MultiIndex.from_tuples(tups)).values

    # a list of tuples gives one hash per tuple
    self.assert_numpy_array_equal(hash_tuples(tups), expected)

    # a single tuple gives the matching single hash
    self.assertEqual(hash_tuples(tups[0]), expected[0])
79+
80+
def test_hash_tuples_err(self):
    """hash_tuples must raise TypeError for each of these scalar inputs."""
    scalars = (5, 'foo', pd.Timestamp('20130101'))
    for scalar in scalars:
        self.assertRaises(TypeError, hash_tuples, scalar)
84+
85+
def test_multiindex_unique(self):
    """A unique MultiIndex must hash to unique values."""
    pairs = [(118, 472), (236, 118), (51, 204), (102, 51)]
    mi = MultiIndex.from_tuples(pairs)
    self.assertTrue(mi.is_unique)

    hashed = hash_pandas_object(mi)
    self.assertTrue(hashed.is_unique)
self.assertTrue(result.is_unique)
5791

5892
def test_hash_pandas_object(self):
5993

@@ -65,14 +99,27 @@ def test_hash_pandas_object(self):
6599
Series(['a', np.nan, 'c']),
66100
Series(['a', None, 'c']),
67101
Series([True, False, True]),
102+
Series(),
68103
Index([1, 2, 3]),
69104
Index([True, False, True]),
70105
DataFrame({'x': ['a', 'b', 'c'], 'y': [1, 2, 3]}),
106+
DataFrame(),
71107
tm.makeMissingDataframe(),
72108
tm.makeMixedDataFrame(),
73109
tm.makeTimeDataFrame(),
74110
tm.makeTimeSeries(),
75-
tm.makeTimedeltaIndex()]:
111+
tm.makeTimedeltaIndex(),
112+
tm.makePeriodIndex(),
113+
Series(tm.makePeriodIndex()),
114+
Series(pd.date_range('20130101',
115+
periods=3, tz='US/Eastern')),
116+
MultiIndex.from_product(
117+
[range(5),
118+
['foo', 'bar', 'baz'],
119+
pd.date_range('20130101', periods=2)]),
120+
MultiIndex.from_product(
121+
[pd.CategoricalIndex(list('aabc')),
122+
range(3)])]:
76123
self.check_equal(obj)
77124
self.check_not_equal_with_index(obj)
78125

@@ -107,7 +154,7 @@ def test_categorical_consistency(self):
107154
tm.assert_series_equal(h1, h2)
108155
tm.assert_series_equal(h1, h3)
109156

110-
def test_errors(self):
157+
def test_pandas_errors(self):
111158

112159
for obj in [pd.Timestamp('20130101'), tm.makePanel()]:
113160
def f():
@@ -131,23 +178,6 @@ def f():
131178
hash_pandas_object(Series(list('abc')), hash_key='foo')
132179
self.assertRaises(ValueError, f)
133180

134-
def test_unsupported_objects(self):
135-
136-
# mixed objects are not supported
137-
obj = Series(['1', 2, 3])
138-
139-
def f():
140-
hash_pandas_object(obj)
141-
self.assertRaises(TypeError, f)
142-
143-
# MultiIndex are represented as tuples
144-
obj = Series([1, 2, 3], index=pd.MultiIndex.from_tuples(
145-
[('a', 1), ('a', 2), ('b', 1)]))
146-
147-
def f():
148-
hash_pandas_object(obj)
149-
self.assertRaises(TypeError, f)
150-
151181
def test_alread_encoded(self):
152182
# if already encoded then ok
153183

0 commit comments

Comments
 (0)