diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt
index 2dc15f2fe0781..626ed0b1bac61 100644
--- a/doc/source/whatsnew/v0.20.0.txt
+++ b/doc/source/whatsnew/v0.20.0.txt
@@ -145,6 +145,7 @@ Other enhancements
 - ``pd.merge_asof()`` gained the option ``direction='backward'|'forward'|'nearest'`` (:issue:`14887`)
 - ``Series/DataFrame.asfreq()`` have gained a ``fill_value`` parameter, to fill missing values (:issue:`3715`).
 - ``Series/DataFrame.resample.asfreq`` have gained a ``fill_value`` parameter, to fill missing values during resampling (:issue:`3715`).
+- ``pandas.tools.hashing`` has gained a ``hash_tuples`` routine, and ``hash_pandas_object`` has gained the ability to hash a ``MultiIndex`` (:issue:`15224`)

 .. _ISO 8601 duration: https://en.wikipedia.org/wiki/ISO_8601#Durations

diff --git a/pandas/tools/hashing.py b/pandas/tools/hashing.py
index 6d2186fdab34c..a62c80c6f8d67 100644
--- a/pandas/tools/hashing.py
+++ b/pandas/tools/hashing.py
@@ -1,18 +1,49 @@
 """
 data hash pandas / numpy objects
 """
+import itertools

 import numpy as np
-from pandas import _hash, Series, factorize, Categorical, Index
+from pandas import _hash, Series, factorize, Categorical, Index, MultiIndex
+import pandas.core.algorithms as algos
 from pandas.lib import is_bool_array
 from pandas.types.generic import ABCIndexClass, ABCSeries, ABCDataFrame
 from pandas.types.common import (is_categorical_dtype, is_numeric_dtype,
-                                 is_datetime64_dtype, is_timedelta64_dtype)
+                                 is_datetime64_dtype, is_timedelta64_dtype,
+                                 is_list_like)

 # 16 byte long hashing key
 _default_hash_key = '0123456789123456'


+def _combine_hash_arrays(arrays, num_items):
+    """
+    Parameters
+    ----------
+    arrays : generator
+    num_items : int
+
+    Should be the same as CPython's tupleobject.c
+    """
+    try:
+        first = next(arrays)
+    except StopIteration:
+        return np.array([], dtype=np.uint64)
+
+    arrays = itertools.chain([first], arrays)
+
+    mult = np.uint64(1000003)
+    out = np.zeros_like(first) + np.uint64(0x345678)
+    for i, a in enumerate(arrays):
+        inverse_i = num_items - i
+        out ^= a
+        out *= mult
+        mult += np.uint64(82520 + inverse_i + inverse_i)
+    assert i + 1 == num_items, 'Fed in wrong num_items'
+    out += np.uint64(97531)
+    return out
+
+
 def hash_pandas_object(obj, index=True, encoding='utf8', hash_key=None,
                        categorize=True):
     """
@@ -41,45 +72,97 @@ def hash_pandas_object(obj, index=True, encoding='utf8', hash_key=None,
     if hash_key is None:
         hash_key = _default_hash_key

-    def adder(h, hashed_to_add):
-        h = np.multiply(h, np.uint(3), h)
-        return np.add(h, hashed_to_add, h)
+    if isinstance(obj, MultiIndex):
+        return Series(hash_tuples(obj, encoding, hash_key),
+                      dtype='uint64', copy=False)

     if isinstance(obj, ABCIndexClass):
         h = hash_array(obj.values, encoding, hash_key,
-                       categorize).astype('uint64')
-        h = Series(h, index=obj, dtype='uint64')
+                       categorize).astype('uint64', copy=False)
+        h = Series(h, index=obj, dtype='uint64', copy=False)
     elif isinstance(obj, ABCSeries):
         h = hash_array(obj.values, encoding, hash_key,
-                       categorize).astype('uint64')
+                       categorize).astype('uint64', copy=False)
         if index:
-            h = adder(h, hash_pandas_object(obj.index,
-                                            index=False,
-                                            encoding=encoding,
-                                            hash_key=hash_key,
-                                            categorize=categorize).values)
-        h = Series(h, index=obj.index, dtype='uint64')
+            index_iter = (hash_pandas_object(obj.index,
+                                             index=False,
+                                             encoding=encoding,
+                                             hash_key=hash_key,
+                                             categorize=categorize).values
+                          for _ in [None])
+            arrays = itertools.chain([h], index_iter)
+            h = _combine_hash_arrays(arrays, 2)
+
+        h = Series(h, index=obj.index, dtype='uint64', copy=False)
+
     elif isinstance(obj, ABCDataFrame):
-        cols = obj.iteritems()
-        first_series = next(cols)[1]
-        h = hash_array(first_series.values, encoding,
-                       hash_key, categorize).astype('uint64')
-        for _, col in cols:
-            h = adder(h, hash_array(col.values, encoding, hash_key,
-                                    categorize))
+        hashes = (hash_array(series.values) for _, series in obj.iteritems())
+        num_items = len(obj.columns)
         if index:
-            h = adder(h, hash_pandas_object(obj.index,
-                                            index=False,
-                                            encoding=encoding,
-                                            hash_key=hash_key,
-                                            categorize=categorize).values)
+            index_hash_generator = (hash_pandas_object(obj.index,
+                                                       index=False,
+                                                       encoding=encoding,
+                                                       hash_key=hash_key,
+                                                       categorize=categorize).values  # noqa
+                                    for _ in [None])
+            num_items += 1
+            hashes = itertools.chain(hashes, index_hash_generator)
+        h = _combine_hash_arrays(hashes, num_items)

-        h = Series(h, index=obj.index, dtype='uint64')
+        h = Series(h, index=obj.index, dtype='uint64', copy=False)
     else:
         raise TypeError("Unexpected type for hashing %s" % type(obj))
     return h


+def hash_tuples(vals, encoding='utf8', hash_key=None):
+    """
+    Hash a MultiIndex / list-of-tuples efficiently
+
+    .. versionadded:: 0.20.0
+
+    Parameters
+    ----------
+    vals : MultiIndex, list-of-tuples, or single tuple
+    encoding : string, default 'utf8'
+    hash_key : string key to encode, default to _default_hash_key
+
+    Returns
+    -------
+    ndarray of hashed values
+    """
+
+    is_tuple = False
+    if isinstance(vals, tuple):
+        vals = [vals]
+        is_tuple = True
+    elif not is_list_like(vals):
+        raise TypeError("must be convertible to a list-of-tuples")
+
+    if not isinstance(vals, MultiIndex):
+        vals = MultiIndex.from_tuples(vals)
+
+    # create a list-of-ndarrays
+    def get_level_values(num):
+        unique = vals.levels[num]  # .values
+        labels = vals.labels[num]
+        filled = algos.take_1d(unique._values, labels,
+                               fill_value=unique._na_value)
+        return filled
+
+    vals = [get_level_values(level)
+            for level in range(vals.nlevels)]
+
+    # hash the list-of-ndarrays
+    hashes = (hash_array(l, encoding=encoding, hash_key=hash_key)
+              for l in vals)
+    h = _combine_hash_arrays(hashes, len(vals))
+    if is_tuple:
+        h = h[0]
+
+    return h
+
+
 def _hash_categorical(c, encoding, hash_key):
     """
     Hash a Categorical by hashing its categories, and then mapping the codes
@@ -97,7 +180,7 @@ def _hash_categorical(c, encoding, hash_key):
     """
     cat_hashed = hash_array(c.categories.values, encoding, hash_key,
                             categorize=False).astype(np.uint64, copy=False)
-    return c.rename_categories(cat_hashed).astype(np.uint64)
+    return c.rename_categories(cat_hashed).astype(np.uint64, copy=False)


 def hash_array(vals, encoding='utf8', hash_key=None, categorize=True):
@@ -142,9 +225,10 @@ def hash_array(vals, encoding='utf8', hash_key=None, categorize=True):
     # manage it.
     if is_bool_array(vals):
         vals = vals.astype('u8')
-    elif ((is_datetime64_dtype(vals) or
-           is_timedelta64_dtype(vals) or
-           is_numeric_dtype(vals)) and vals.dtype.itemsize <= 8):
+    elif (is_datetime64_dtype(vals) or
+          is_timedelta64_dtype(vals)):
+        vals = vals.view('i8').astype('u8', copy=False)
+    elif (is_numeric_dtype(vals) and vals.dtype.itemsize <= 8):
         vals = vals.view('u{}'.format(vals.dtype.itemsize)).astype('u8')
     else:
         # With repeated values, its MUCH faster to categorize object dtypes,
@@ -156,7 +240,13 @@ def hash_array(vals, encoding='utf8', hash_key=None, categorize=True):
                               ordered=False, fastpath=True)
             return _hash_categorical(cat, encoding, hash_key)

-        vals = _hash.hash_object_array(vals, hash_key, encoding)
+        try:
+            vals = _hash.hash_object_array(vals, hash_key, encoding)
+        except TypeError:
+
+            # we have mixed types
+            vals = _hash.hash_object_array(vals.astype(str).astype(object),
+                                           hash_key, encoding)

     # Then, redistribute these 64-bit ints within the space of 64-bit ints
     vals ^= vals >> 30
diff --git a/pandas/tools/tests/test_hashing.py b/pandas/tools/tests/test_hashing.py
index 7913706f5658b..ed5a74f8cfcf2 100644
--- a/pandas/tools/tests/test_hashing.py
+++ b/pandas/tools/tests/test_hashing.py
@@ -1,8 +1,8 @@
 import numpy as np
 import pandas as pd

-from pandas import DataFrame, Series, Index
-from pandas.tools.hashing import hash_array, hash_pandas_object
+from pandas import DataFrame, Series, Index, MultiIndex
+from pandas.tools.hashing import hash_array, hash_tuples, hash_pandas_object
 import pandas.util.testing as tm


@@ -36,6 +36,11 @@ def test_hash_array(self):
             a = s.values
             tm.assert_numpy_array_equal(hash_array(a), hash_array(a))

+    def test_hash_array_mixed(self):
+        for data in [np.array([3, 4, 'All']),
+                     np.array([3, 4, 'All'], dtype=object)]:
+            tm.assert_numpy_array_equal(hash_array(data), hash_array(data))
+
     def check_equal(self, obj, **kwargs):
         a = hash_pandas_object(obj, **kwargs)
         b = hash_pandas_object(obj, **kwargs)
@@ -53,7 +58,29 @@ def check_not_equal_with_index(self, obj):
         if not isinstance(obj, Index):
             a = hash_pandas_object(obj, index=True)
             b = hash_pandas_object(obj, index=False)
-            self.assertFalse((a == b).all())
+            if len(obj):
+                self.assertFalse((a == b).all())
+
+    def test_hash_tuples(self):
+        tups = [(1, 'one'), (1, 'two'), (2, 'one')]
+        result = hash_tuples(tups)
+        expected = hash_pandas_object(MultiIndex.from_tuples(tups)).values
+        self.assert_numpy_array_equal(result, expected)
+
+        result = hash_tuples(tups[0])
+        self.assertEqual(result, expected[0])
+
+    def test_hash_tuples_err(self):
+
+        for val in [5, 'foo', pd.Timestamp('20130101')]:
+            self.assertRaises(TypeError, hash_tuples, val)
+
+    def test_multiindex_unique(self):
+        mi = MultiIndex.from_tuples([(118, 472), (236, 118),
+                                     (51, 204), (102, 51)])
+        self.assertTrue(mi.is_unique)
+        result = hash_pandas_object(mi)
+        self.assertTrue(result.is_unique)

     def test_hash_pandas_object(self):

@@ -65,14 +92,27 @@ def test_hash_pandas_object(self):
                     Series(['a', np.nan, 'c']),
                     Series(['a', None, 'c']),
                     Series([True, False, True]),
+                    Series(),
                     Index([1, 2, 3]),
                     Index([True, False, True]),
                     DataFrame({'x': ['a', 'b', 'c'], 'y': [1, 2, 3]}),
+                    DataFrame(),
                     tm.makeMissingDataframe(),
                     tm.makeMixedDataFrame(),
                     tm.makeTimeDataFrame(),
                     tm.makeTimeSeries(),
-                    tm.makeTimedeltaIndex()]:
+                    tm.makeTimedeltaIndex(),
+                    tm.makePeriodIndex(),
+                    Series(tm.makePeriodIndex()),
+                    Series(pd.date_range('20130101',
+                                         periods=3, tz='US/Eastern')),
+                    MultiIndex.from_product(
+                        [range(5),
+                         ['foo', 'bar', 'baz'],
+                         pd.date_range('20130101', periods=2)]),
+                    MultiIndex.from_product(
+                        [pd.CategoricalIndex(list('aabc')),
+                         range(3)])]:
             self.check_equal(obj)
             self.check_not_equal_with_index(obj)

@@ -131,23 +171,6 @@ def f():
             hash_pandas_object(Series(list('abc')), hash_key='foo')
         self.assertRaises(ValueError, f)

-    def test_unsupported_objects(self):
-
-        # mixed objects are not supported
-        obj = Series(['1', 2, 3])
-
-        def f():
-            hash_pandas_object(obj)
-        self.assertRaises(TypeError, f)
-
-        # MultiIndex are represented as tuples
-        obj = Series([1, 2, 3], index=pd.MultiIndex.from_tuples(
-            [('a', 1), ('a', 2), ('b', 1)]))
-
-        def f():
-            hash_pandas_object(obj)
-        self.assertRaises(TypeError, f)
-
     def test_alread_encoded(self):
         # if already encoded then ok
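
For reference, a minimal usage sketch (not part of the patch) of the API added above, assuming the 0.20.0-era layout where ``hash_tuples`` and ``hash_pandas_object`` live in ``pandas.tools.hashing``:

    import pandas as pd
    from pandas import MultiIndex
    from pandas.tools.hashing import hash_pandas_object, hash_tuples

    mi = MultiIndex.from_tuples([(1, 'one'), (1, 'two'), (2, 'one')])

    # hash_pandas_object now accepts a MultiIndex directly and returns a
    # uint64 Series with one hash per row
    hashed = hash_pandas_object(mi)

    # hash_tuples returns the same values as a raw uint64 ndarray; it also
    # accepts a plain list of tuples or a single tuple
    assert (hash_tuples(list(mi)) == hashed.values).all()
    assert hash_tuples((1, 'one')) == hashed.values[0]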