ENH: support MultiIndex and tuple hashing

jreback · jreback · commit c67486f076f4 · 2017-01-27T09:19:43.000-05:00
closes #15227 Author: Jeff Reback <jeff@reback.net> Author: Mike Graham <mikegraham2gmail.com> Closes #15224 from jreback/mi_hash2 and squashes the following commits: 8b1d3f9 [Jeff Reback] not correctly hashing categorical in a MI 48a2402 [Jeff Reback] support for mixed type arrays 58f682d [Jeff Reback] memory optimization 0c13df7 [Mike Graham] Steal the algorithm used to combine hashes from tupleobject.c e8dd607 [Jeff Reback] add hash_tuples 44e9c7d [Mike Graham] wipSteal the algorithm used to combine hashes from tupleobject.c e507c4a [Jeff Reback] ENH: support MultiIndex and tuple hashing
diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt
@@ -145,6 +145,7 @@ Other enhancements
 - ``pd.merge_asof()`` gained the option ``direction='backward'|'forward'|'nearest'`` (:issue:`14887`)
 - ``Series/DataFrame.asfreq()`` have gained a ``fill_value`` parameter, to fill missing values (:issue:`3715`).
 - ``Series/DataFrame.resample.asfreq`` have gained a ``fill_value`` parameter, to fill missing values during resampling (:issue:`3715`).
+- ``pandas.tools.hashing`` has gained a ``hash_tuples`` routine, and ``hash_pandas_object`` has gained the ability to hash a ``MultiIndex`` (:issue:`15224`)
 
 .. _ISO 8601 duration: https://en.wikipedia.org/wiki/ISO_8601#Durations
 
diff --git a/pandas/tools/hashing.py b/pandas/tools/hashing.py
@@ -1,18 +1,49 @@
 """
 data hash pandas / numpy objects
 """
+import itertools
 
 import numpy as np
-from pandas import _hash, Series, factorize, Categorical, Index
+from pandas import _hash, Series, factorize, Categorical, Index, MultiIndex
+import pandas.core.algorithms as algos
 from pandas.lib import is_bool_array
 from pandas.types.generic import ABCIndexClass, ABCSeries, ABCDataFrame
 from pandas.types.common import (is_categorical_dtype, is_numeric_dtype,
-                                 is_datetime64_dtype, is_timedelta64_dtype)
+                                 is_datetime64_dtype, is_timedelta64_dtype,
+                                 is_list_like)
 
 # 16 byte long hashing key
 _default_hash_key = '0123456789123456'
 
 
+def _combine_hash_arrays(arrays, num_items):
+    """
+    Parameters
+    ----------
+    arrays : generator
+    num_items : int
+
+    Combine the hashes the same way CPython's tupleobject.c hashes a tuple
+    """
+    try:
+        first = next(arrays)
+    except StopIteration:
+        return np.array([], dtype=np.uint64)
+
+    arrays = itertools.chain([first], arrays)
+
+    mult = np.uint64(1000003)
+    out = np.zeros_like(first) + np.uint64(0x345678)
+    for i, a in enumerate(arrays):
+        inverse_i = num_items - i
+        out ^= a
+        out *= mult
+        mult += np.uint64(82520 + inverse_i + inverse_i)
+    assert i + 1 == num_items, 'Fed in wrong num_items'
+    out += np.uint64(97531)
+    return out
+
+
 def hash_pandas_object(obj, index=True, encoding='utf8', hash_key=None,
                        categorize=True):
     """
@@ -41,45 +72,97 @@ def hash_pandas_object(obj, index=True, encoding='utf8', hash_key=None,
     if hash_key is None:
         hash_key = _default_hash_key
 
-    def adder(h, hashed_to_add):
-        h = np.multiply(h, np.uint(3), h)
-        return np.add(h, hashed_to_add, h)
+    if isinstance(obj, MultiIndex):
+        return Series(hash_tuples(obj, encoding, hash_key),
+                      dtype='uint64', copy=False)
 
     if isinstance(obj, ABCIndexClass):
         h = hash_array(obj.values, encoding, hash_key,
-                       categorize).astype('uint64')
-        h = Series(h, index=obj, dtype='uint64')
+                       categorize).astype('uint64', copy=False)
+        h = Series(h, index=obj, dtype='uint64', copy=False)
     elif isinstance(obj, ABCSeries):
         h = hash_array(obj.values, encoding, hash_key,
-                       categorize).astype('uint64')
+                       categorize).astype('uint64', copy=False)
         if index:
-            h = adder(h, hash_pandas_object(obj.index,
-                                            index=False,
-                                            encoding=encoding,
-                                            hash_key=hash_key,
-                                            categorize=categorize).values)
-        h = Series(h, index=obj.index, dtype='uint64')
+            index_iter = (hash_pandas_object(obj.index,
+                                             index=False,
+                                             encoding=encoding,
+                                             hash_key=hash_key,
+                                             categorize=categorize).values
+                          for _ in [None])
+            arrays = itertools.chain([h], index_iter)
+            h = _combine_hash_arrays(arrays, 2)
+
+        h = Series(h, index=obj.index, dtype='uint64', copy=False)
+
     elif isinstance(obj, ABCDataFrame):
-        cols = obj.iteritems()
-        first_series = next(cols)[1]
-        h = hash_array(first_series.values, encoding,
-                       hash_key, categorize).astype('uint64')
-        for _, col in cols:
-            h = adder(h, hash_array(col.values, encoding, hash_key,
-                                    categorize))
+        hashes = (hash_array(series.values) for _, series in obj.iteritems())
+        num_items = len(obj.columns)
         if index:
-            h = adder(h, hash_pandas_object(obj.index,
-                                            index=False,
-                                            encoding=encoding,
-                                            hash_key=hash_key,
-                                            categorize=categorize).values)
+            index_hash_generator = (hash_pandas_object(obj.index,
+                                                       index=False,
+                                                       encoding=encoding,
+                                                       hash_key=hash_key,
+                                                       categorize=categorize).values  # noqa
+                                    for _ in [None])
+            num_items += 1
+            hashes = itertools.chain(hashes, index_hash_generator)
+        h = _combine_hash_arrays(hashes, num_items)
 
-        h = Series(h, index=obj.index, dtype='uint64')
+        h = Series(h, index=obj.index, dtype='uint64', copy=False)
     else:
         raise TypeError("Unexpected type for hashing %s" % type(obj))
     return h
 
 
+def hash_tuples(vals, encoding='utf8', hash_key=None):
+    """
+    Hash a MultiIndex / list-of-tuples efficiently
+
+    .. versionadded:: 0.20.0
+
+    Parameters
+    ----------
+    vals : MultiIndex, list-of-tuples, or single tuple
+    encoding : string, default 'utf8'
+    hash_key : string key to encode, default to _default_hash_key
+
+    Returns
+    -------
+    ndarray of hashed values
+    """
+
+    is_tuple = False
+    if isinstance(vals, tuple):
+        vals = [vals]
+        is_tuple = True
+    elif not is_list_like(vals):
+        raise TypeError("must be convertible to a list-of-tuples")
+
+    if not isinstance(vals, MultiIndex):
+        vals = MultiIndex.from_tuples(vals)
+
+    # create a list-of-ndarrays
+    def get_level_values(num):
+        unique = vals.levels[num]  # .values
+        labels = vals.labels[num]
+        filled = algos.take_1d(unique._values, labels,
+                               fill_value=unique._na_value)
+        return filled
+
+    vals = [get_level_values(level)
+            for level in range(vals.nlevels)]
+
+    # hash the list-of-ndarrays
+    hashes = (hash_array(l, encoding=encoding, hash_key=hash_key)
+              for l in vals)
+    h = _combine_hash_arrays(hashes, len(vals))
+    if is_tuple:
+        h = h[0]
+
+    return h
+
+
 def _hash_categorical(c, encoding, hash_key):
     """
     Hash a Categorical by hashing its categories, and then mapping the codes
@@ -97,7 +180,7 @@ def _hash_categorical(c, encoding, hash_key):
     """
     cat_hashed = hash_array(c.categories.values, encoding, hash_key,
                             categorize=False).astype(np.uint64, copy=False)
-    return c.rename_categories(cat_hashed).astype(np.uint64)
+    return c.rename_categories(cat_hashed).astype(np.uint64, copy=False)
 
 
 def hash_array(vals, encoding='utf8', hash_key=None, categorize=True):
@@ -108,7 +191,7 @@ def hash_array(vals, encoding='utf8', hash_key=None, categorize=True):
 
     Parameters
     ----------
-    vals : ndarray
+    vals : ndarray, Categorical
     encoding : string, default 'utf8'
         encoding for data & key when strings
     hash_key : string key to encode, default to _default_hash_key
@@ -124,6 +207,9 @@ def hash_array(vals, encoding='utf8', hash_key=None, categorize=True):
 
     """
 
+    if not hasattr(vals, 'dtype'):
+        raise TypeError("must pass a ndarray-like")
+
     if hash_key is None:
         hash_key = _default_hash_key
 
@@ -142,9 +228,10 @@ def hash_array(vals, encoding='utf8', hash_key=None, categorize=True):
     # manage it.
     if is_bool_array(vals):
         vals = vals.astype('u8')
-    elif ((is_datetime64_dtype(vals) or
-           is_timedelta64_dtype(vals) or
-           is_numeric_dtype(vals)) and vals.dtype.itemsize <= 8):
+    elif (is_datetime64_dtype(vals) or
+          is_timedelta64_dtype(vals)):
+        vals = vals.view('i8').astype('u8', copy=False)
+    elif (is_numeric_dtype(vals) and vals.dtype.itemsize <= 8):
         vals = vals.view('u{}'.format(vals.dtype.itemsize)).astype('u8')
     else:
         # With repeated values, its MUCH faster to categorize object dtypes,
@@ -156,7 +243,12 @@ def hash_array(vals, encoding='utf8', hash_key=None, categorize=True):
                               ordered=False, fastpath=True)
             return _hash_categorical(cat, encoding, hash_key)
 
-        vals = _hash.hash_object_array(vals, hash_key, encoding)
+        try:
+            vals = _hash.hash_object_array(vals, hash_key, encoding)
+        except TypeError:
+            # we have mixed types
+            vals = _hash.hash_object_array(vals.astype(str).astype(object),
+                                           hash_key, encoding)
 
     # Then, redistribute these 64-bit ints within the space of 64-bit ints
     vals ^= vals >> 30
diff --git a/pandas/tools/tests/test_hashing.py b/pandas/tools/tests/test_hashing.py
@@ -1,8 +1,8 @@
 import numpy as np
 import pandas as pd
 
-from pandas import DataFrame, Series, Index
-from pandas.tools.hashing import hash_array, hash_pandas_object
+from pandas import DataFrame, Series, Index, MultiIndex
+from pandas.tools.hashing import hash_array, hash_tuples, hash_pandas_object
 import pandas.util.testing as tm
 
 
@@ -36,6 +36,18 @@ def test_hash_array(self):
             a = s.values
             tm.assert_numpy_array_equal(hash_array(a), hash_array(a))
 
+    def test_hash_array_mixed(self):
+        result1 = hash_array(np.array([3, 4, 'All']))
+        result2 = hash_array(np.array(['3', '4', 'All']))
+        result3 = hash_array(np.array([3, 4, 'All'], dtype=object))
+        tm.assert_numpy_array_equal(result1, result2)
+        tm.assert_numpy_array_equal(result1, result3)
+
+    def test_hash_array_errors(self):
+
+        for val in [5, 'foo', pd.Timestamp('20130101')]:
+            self.assertRaises(TypeError, hash_array, val)
+
     def check_equal(self, obj, **kwargs):
         a = hash_pandas_object(obj, **kwargs)
         b = hash_pandas_object(obj, **kwargs)
@@ -53,7 +65,29 @@ def check_not_equal_with_index(self, obj):
         if not isinstance(obj, Index):
             a = hash_pandas_object(obj, index=True)
             b = hash_pandas_object(obj, index=False)
-            self.assertFalse((a == b).all())
+            if len(obj):
+                self.assertFalse((a == b).all())
+
+    def test_hash_tuples(self):
+        tups = [(1, 'one'), (1, 'two'), (2, 'one')]
+        result = hash_tuples(tups)
+        expected = hash_pandas_object(MultiIndex.from_tuples(tups)).values
+        self.assert_numpy_array_equal(result, expected)
+
+        result = hash_tuples(tups[0])
+        self.assertEqual(result, expected[0])
+
+    def test_hash_tuples_err(self):
+
+        for val in [5, 'foo', pd.Timestamp('20130101')]:
+            self.assertRaises(TypeError, hash_tuples, val)
+
+    def test_multiindex_unique(self):
+        mi = MultiIndex.from_tuples([(118, 472), (236, 118),
+                                     (51, 204), (102, 51)])
+        self.assertTrue(mi.is_unique)
+        result = hash_pandas_object(mi)
+        self.assertTrue(result.is_unique)
 
     def test_hash_pandas_object(self):
 
@@ -65,14 +99,27 @@ def test_hash_pandas_object(self):
                     Series(['a', np.nan, 'c']),
                     Series(['a', None, 'c']),
                     Series([True, False, True]),
+                    Series(),
                     Index([1, 2, 3]),
                     Index([True, False, True]),
                     DataFrame({'x': ['a', 'b', 'c'], 'y': [1, 2, 3]}),
+                    DataFrame(),
                     tm.makeMissingDataframe(),
                     tm.makeMixedDataFrame(),
                     tm.makeTimeDataFrame(),
                     tm.makeTimeSeries(),
-                    tm.makeTimedeltaIndex()]:
+                    tm.makeTimedeltaIndex(),
+                    tm.makePeriodIndex(),
+                    Series(tm.makePeriodIndex()),
+                    Series(pd.date_range('20130101',
+                                         periods=3, tz='US/Eastern')),
+                    MultiIndex.from_product(
+                        [range(5),
+                         ['foo', 'bar', 'baz'],
+                         pd.date_range('20130101', periods=2)]),
+                    MultiIndex.from_product(
+                        [pd.CategoricalIndex(list('aabc')),
+                         range(3)])]:
             self.check_equal(obj)
             self.check_not_equal_with_index(obj)
 
@@ -107,7 +154,7 @@ def test_categorical_consistency(self):
                 tm.assert_series_equal(h1, h2)
                 tm.assert_series_equal(h1, h3)
 
-    def test_errors(self):
+    def test_pandas_errors(self):
 
         for obj in [pd.Timestamp('20130101'), tm.makePanel()]:
             def f():
@@ -131,23 +178,6 @@ def f():
             hash_pandas_object(Series(list('abc')), hash_key='foo')
         self.assertRaises(ValueError, f)
 
-    def test_unsupported_objects(self):
-
-        # mixed objects are not supported
-        obj = Series(['1', 2, 3])
-
-        def f():
-            hash_pandas_object(obj)
-        self.assertRaises(TypeError, f)
-
-        # MultiIndex are represented as tuples
-        obj = Series([1, 2, 3], index=pd.MultiIndex.from_tuples(
-            [('a', 1), ('a', 2), ('b', 1)]))
-
-        def f():
-            hash_pandas_object(obj)
-        self.assertRaises(TypeError, f)
-
     def test_alread_encoded(self):
         # if already encoded then ok