Skip to content

Commit 2e291d6

Browse files
committed
ENH: support MultiIndex and tuple hashing
1 parent 595e5e8 commit 2e291d6

File tree

2 files changed

+52
-12
lines changed

2 files changed

+52
-12
lines changed

pandas/tools/hashing.py

+34-2
Original file line numberDiff line numberDiff line change
@@ -3,11 +3,12 @@
33
"""
44

55
import numpy as np
6-
from pandas import _hash, Series, factorize, Categorical, Index
6+
from pandas import _hash, Series, factorize, Categorical, Index, MultiIndex
77
from pandas.lib import is_bool_array
88
from pandas.types.generic import ABCIndexClass, ABCSeries, ABCDataFrame
99
from pandas.types.common import (is_categorical_dtype, is_numeric_dtype,
10-
is_datetime64_dtype, is_timedelta64_dtype)
10+
is_datetime64_dtype, is_timedelta64_dtype,
11+
is_object_dtype)
1112

1213
# 16 byte long hashing key
1314
_default_hash_key = '0123456789123456'
@@ -45,6 +46,9 @@ def adder(h, hashed_to_add):
4546
h = np.multiply(h, np.uint(3), h)
4647
return np.add(h, hashed_to_add, h)
4748

49+
if isinstance(obj, MultiIndex):
50+
return _hash_tuples(obj, encoding, hash_key)
51+
4852
if isinstance(obj, ABCIndexClass):
4953
h = hash_array(obj.values, encoding, hash_key,
5054
categorize).astype('uint64')
@@ -80,6 +84,30 @@ def adder(h, hashed_to_add):
8084
return h
8185

8286

87+
def _hash_tuples(vals, encoding, hash_key):
88+
"""
89+
Hash a MultiIndex / array_of_tuples efficiently
90+
91+
Parameters
92+
----------
93+
vals : MultiIndex or ndarray of tuples
94+
encoding : string, default 'utf8'
95+
hash_key : string key to encode, default to _default_hash_key
96+
97+
Returns
98+
-------
99+
ndarray of hashed values, same length as vals
100+
"""
101+
102+
if not isinstance(vals, MultiIndex):
103+
vals = MultiIndex.from_tuples(vals)
104+
105+
# efficiently turn us into a DataFrame and hash
106+
return hash_pandas_object(vals.to_dataframe(index=False),
107+
index=False, encoding=encoding,
108+
hash_key=hash_key, categorize=False)
109+
110+
83111
def _hash_categorical(c, encoding, hash_key):
84112
"""
85113
Hash a Categorical by hashing its categories, and then mapping the codes
@@ -127,6 +155,10 @@ def hash_array(vals, encoding='utf8', hash_key=None, categorize=True):
127155
if hash_key is None:
128156
hash_key = _default_hash_key
129157

158+
if isinstance(vals, list) and len(vals) and isinstance(vals[0], tuple):
159+
# we hash a list of tuples similar to a MultiIndex
160+
return _hash_tuples(vals, encoding, hash_key).values
161+
130162
# For categoricals, we hash the categories, then remap the codes to the
131163
# hash values. (This check is above the complex check so that we don't ask
132164
# numpy if categorical is a subdtype of complex, as it will choke.

pandas/tools/tests/test_hashing.py

+18-10
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
import numpy as np
22
import pandas as pd
33

4-
from pandas import DataFrame, Series, Index
4+
from pandas import DataFrame, Series, Index, MultiIndex
55
from pandas.tools.hashing import hash_array, hash_pandas_object
66
import pandas.util.testing as tm
77

@@ -55,6 +55,18 @@ def check_not_equal_with_index(self, obj):
5555
b = hash_pandas_object(obj, index=False)
5656
self.assertFalse((a == b).all())
5757

58+
def test_hash_list_tuples(self):
59+
tups = [(1, 'one'), (1, 'two'), (2, 'one')]
60+
result = hash_array(tups)
61+
expected = hash_pandas_object(MultiIndex.from_tuples(tups)).values
62+
self.assert_numpy_array_equal(result, expected)
63+
64+
def test_multiindex_unique(self):
65+
mi = MultiIndex.from_tuples([(118, 472), (236, 118), (51, 204), (102, 51)])
66+
self.assertTrue(mi.is_unique)
67+
result = hash_pandas_object(mi)
68+
self.assertTrue(result.is_unique)
69+
5870
def test_hash_pandas_object(self):
5971

6072
for obj in [Series([1, 2, 3]),
@@ -72,7 +84,11 @@ def test_hash_pandas_object(self):
7284
tm.makeMixedDataFrame(),
7385
tm.makeTimeDataFrame(),
7486
tm.makeTimeSeries(),
75-
tm.makeTimedeltaIndex()]:
87+
tm.makeTimedeltaIndex(),
88+
MultiIndex.from_product(
89+
[range(5),
90+
['foo', 'bar', 'baz'],
91+
pd.date_range('20130101', periods=2)])]:
7692
self.check_equal(obj)
7793
self.check_not_equal_with_index(obj)
7894

@@ -140,14 +156,6 @@ def f():
140156
hash_pandas_object(obj)
141157
self.assertRaises(TypeError, f)
142158

143-
# MultiIndex are represented as tuples
144-
obj = Series([1, 2, 3], index=pd.MultiIndex.from_tuples(
145-
[('a', 1), ('a', 2), ('b', 1)]))
146-
147-
def f():
148-
hash_pandas_object(obj)
149-
self.assertRaises(TypeError, f)
150-
151159
def test_alread_encoded(self):
152160
# if already encoded then ok
153161

0 commit comments

Comments
 (0)