ENH: support MultiIndex and tuple hashing

jreback · jreback · commit e507c4a790c4 · 2017-01-26T10:01:17.000-05:00
diff --git a/pandas/tools/hashing.py b/pandas/tools/hashing.py
@@ -3,11 +3,12 @@
 """
 
 import numpy as np
-from pandas import _hash, Series, factorize, Categorical, Index
+from pandas import _hash, Series, factorize, Categorical, Index, MultiIndex
 from pandas.lib import is_bool_array
 from pandas.types.generic import ABCIndexClass, ABCSeries, ABCDataFrame
 from pandas.types.common import (is_categorical_dtype, is_numeric_dtype,
-                                 is_datetime64_dtype, is_timedelta64_dtype)
+                                 is_datetime64_dtype, is_timedelta64_dtype,
+                                 is_object_dtype)
 
 # 16 byte long hashing key
 _default_hash_key = '0123456789123456'
@@ -45,6 +46,9 @@ def adder(h, hashed_to_add):
         h = np.multiply(h, np.uint(3), h)
         return np.add(h, hashed_to_add, h)
 
+    if isinstance(obj, MultiIndex):
+        return _hash_tuples(obj, encoding, hash_key)
+
     if isinstance(obj, ABCIndexClass):
         h = hash_array(obj.values, encoding, hash_key,
                        categorize).astype('uint64')
@@ -80,6 +84,30 @@ def adder(h, hashed_to_add):
     return h
 
 
+def _hash_tuples(vals, encoding, hash_key):
+    """
+    Hash a MultiIndex / array_of_tuples efficiently
+
+    Parameters
+    ----------
+    vals : MultiIndex or ndarray of tuples
+    encoding : string, default 'utf8'
+    hash_key : string key to encode, default to _default_hash_key
+
+    Returns
+    -------
+    Series of hashed values, same length as vals
+    """
+
+    if not isinstance(vals, MultiIndex):
+        vals = MultiIndex.from_tuples(vals)
+
+    # efficiently turn us into a DataFrame and hash
+    return hash_pandas_object(vals.to_dataframe(index=False),
+                              index=False, encoding=encoding,
+                              hash_key=hash_key, categorize=False)
+
+
 def _hash_categorical(c, encoding, hash_key):
     """
     Hash a Categorical by hashing its categories, and then mapping the codes
@@ -127,6 +155,10 @@ def hash_array(vals, encoding='utf8', hash_key=None, categorize=True):
     if hash_key is None:
         hash_key = _default_hash_key
 
+    if isinstance(vals, list) and len(vals) and isinstance(vals[0], tuple):
+        # we hash a list of tuples similar to a MultiIndex
+        return _hash_tuples(vals, encoding, hash_key).values
+
     # For categoricals, we hash the categories, then remap the codes to the
     # hash values. (This check is above the complex check so that we don't ask
     # numpy if categorical is a subdtype of complex, as it will choke.
diff --git a/pandas/tools/tests/test_hashing.py b/pandas/tools/tests/test_hashing.py
@@ -1,7 +1,7 @@
 import numpy as np
 import pandas as pd
 
-from pandas import DataFrame, Series, Index
+from pandas import DataFrame, Series, Index, MultiIndex
 from pandas.tools.hashing import hash_array, hash_pandas_object
 import pandas.util.testing as tm
 
@@ -55,6 +55,18 @@ def check_not_equal_with_index(self, obj):
             b = hash_pandas_object(obj, index=False)
             self.assertFalse((a == b).all())
 
+    def test_hash_list_tuples(self):
+        tups = [(1, 'one'), (1, 'two'), (2, 'one')]
+        result = hash_array(tups)
+        expected = hash_pandas_object(MultiIndex.from_tuples(tups)).values
+        self.assert_numpy_array_equal(result, expected)
+
+    def test_multiindex_unique(self):
+        mi = MultiIndex.from_tuples([(118, 472), (236, 118), (51, 204), (102, 51)])
+        self.assertTrue(mi.is_unique)
+        result = hash_pandas_object(mi)
+        self.assertTrue(result.is_unique)
+
     def test_hash_pandas_object(self):
 
         for obj in [Series([1, 2, 3]),
@@ -72,7 +84,11 @@ def test_hash_pandas_object(self):
                     tm.makeMixedDataFrame(),
                     tm.makeTimeDataFrame(),
                     tm.makeTimeSeries(),
-                    tm.makeTimedeltaIndex()]:
+                    tm.makeTimedeltaIndex(),
+                    MultiIndex.from_product(
+                        [range(5),
+                         ['foo', 'bar', 'baz'],
+                         pd.date_range('20130101', periods=2)])]:
             self.check_equal(obj)
             self.check_not_equal_with_index(obj)
 
@@ -140,14 +156,6 @@ def f():
             hash_pandas_object(obj)
         self.assertRaises(TypeError, f)
 
-        # MultiIndex are represented as tuples
-        obj = Series([1, 2, 3], index=pd.MultiIndex.from_tuples(
-            [('a', 1), ('a', 2), ('b', 1)]))
-
-        def f():
-            hash_pandas_object(obj)
-        self.assertRaises(TypeError, f)
-
     def test_alread_encoded(self):
         # if already encoded then ok