ENH: support MultiIndex and tuple hashing #15224

Closed · wants to merge 7 commits
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.20.0.txt
@@ -145,6 +145,7 @@ Other enhancements
- ``pd.merge_asof()`` gained the option ``direction='backward'|'forward'|'nearest'`` (:issue:`14887`)
- ``Series/DataFrame.asfreq()`` have gained a ``fill_value`` parameter, to fill missing values (:issue:`3715`).
- ``Series/DataFrame.resample.asfreq`` have gained a ``fill_value`` parameter, to fill missing values during resampling (:issue:`3715`).
- ``pandas.tools.hashing`` has gained a ``hash_tuples`` routine, and ``hash_pandas_object`` has gained the ability to hash a ``MultiIndex`` (:issue:`15224`)

.. _ISO 8601 duration: https://en.wikipedia.org/wiki/ISO_8601#Durations

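The whatsnew entry above summarizes the new public API: hash_tuples for tuples and lists of tuples, plus MultiIndex support in hash_pandas_object. Below is a minimal usage sketch, assuming the 0.20.0-era import path pandas.tools.hashing (the module lives elsewhere in later releases); the variable names are illustrative only.

```python
import pandas as pd
from pandas.tools.hashing import hash_tuples, hash_pandas_object

tups = [(1, 'one'), (1, 'two'), (2, 'one')]
mi = pd.MultiIndex.from_tuples(tups)

# hash_pandas_object now accepts a MultiIndex and returns a uint64 Series
hashed_mi = hash_pandas_object(mi)

# hash_tuples hashes a list-of-tuples (or a single tuple) the same way
hashed_tups = hash_tuples(tups)

# the two code paths agree element-wise (see test_hash_tuples below)
assert (hashed_mi.values == hashed_tups).all()
```
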
154 changes: 122 additions & 32 deletions pandas/tools/hashing.py
@@ -1,18 +1,49 @@
"""
data hash pandas / numpy objects
"""
import itertools

import numpy as np
from pandas import _hash, Series, factorize, Categorical, Index
from pandas import _hash, Series, factorize, Categorical, Index, MultiIndex
import pandas.core.algorithms as algos
from pandas.lib import is_bool_array
from pandas.types.generic import ABCIndexClass, ABCSeries, ABCDataFrame
from pandas.types.common import (is_categorical_dtype, is_numeric_dtype,
is_datetime64_dtype, is_timedelta64_dtype)
is_datetime64_dtype, is_timedelta64_dtype,
is_list_like)

# 16 byte long hashing key
_default_hash_key = '0123456789123456'


def _combine_hash_arrays(arrays, num_items):
"""
Parameters
----------
arrays : generator
num_items : int

Should be the same as CPython's tupleobject.c
"""
try:
first = next(arrays)
except StopIteration:
return np.array([], dtype=np.uint64)

arrays = itertools.chain([first], arrays)

mult = np.uint64(1000003)
out = np.zeros_like(first) + np.uint64(0x345678)
for i, a in enumerate(arrays):
inverse_i = num_items - i
out ^= a
out *= mult
mult += np.uint64(82520 + inverse_i + inverse_i)
assert i + 1 == num_items, 'Fed in wrong num_items'
out += np.uint64(97531)
return out


def hash_pandas_object(obj, index=True, encoding='utf8', hash_key=None,
categorize=True):
"""
@@ -41,45 +72,97 @@ def hash_pandas_object(obj, index=True, encoding='utf8', hash_key=None,
if hash_key is None:
hash_key = _default_hash_key

def adder(h, hashed_to_add):
h = np.multiply(h, np.uint(3), h)
return np.add(h, hashed_to_add, h)
if isinstance(obj, MultiIndex):
return Series(hash_tuples(obj, encoding, hash_key),
dtype='uint64', copy=False)

if isinstance(obj, ABCIndexClass):
h = hash_array(obj.values, encoding, hash_key,
categorize).astype('uint64')
h = Series(h, index=obj, dtype='uint64')
categorize).astype('uint64', copy=False)
h = Series(h, index=obj, dtype='uint64', copy=False)
elif isinstance(obj, ABCSeries):
h = hash_array(obj.values, encoding, hash_key,
categorize).astype('uint64')
categorize).astype('uint64', copy=False)
if index:
h = adder(h, hash_pandas_object(obj.index,
index=False,
encoding=encoding,
hash_key=hash_key,
categorize=categorize).values)
h = Series(h, index=obj.index, dtype='uint64')
index_iter = (hash_pandas_object(obj.index,
index=False,
encoding=encoding,
hash_key=hash_key,
categorize=categorize).values
for _ in [None])
arrays = itertools.chain([h], index_iter)
h = _combine_hash_arrays(arrays, 2)

h = Series(h, index=obj.index, dtype='uint64', copy=False)

elif isinstance(obj, ABCDataFrame):
cols = obj.iteritems()
first_series = next(cols)[1]
h = hash_array(first_series.values, encoding,
hash_key, categorize).astype('uint64')
for _, col in cols:
h = adder(h, hash_array(col.values, encoding, hash_key,
categorize))
hashes = (hash_array(series.values) for _, series in obj.iteritems())
num_items = len(obj.columns)
if index:
h = adder(h, hash_pandas_object(obj.index,
index=False,
encoding=encoding,
hash_key=hash_key,
categorize=categorize).values)
index_hash_generator = (hash_pandas_object(obj.index,
index=False,
encoding=encoding,
hash_key=hash_key,
categorize=categorize).values # noqa
for _ in [None])
num_items += 1
hashes = itertools.chain(hashes, index_hash_generator)
h = _combine_hash_arrays(hashes, num_items)

h = Series(h, index=obj.index, dtype='uint64')
h = Series(h, index=obj.index, dtype='uint64', copy=False)
else:
raise TypeError("Unexpected type for hashing %s" % type(obj))
return h


def hash_tuples(vals, encoding='utf8', hash_key=None):
"""
Hash a MultiIndex / list-of-tuples efficiently

.. versionadded:: 0.20.0

Parameters
----------
vals : MultiIndex, list-of-tuples, or single tuple
encoding : string, default 'utf8'
hash_key : string key to encode, default to _default_hash_key

Returns
-------
ndarray of hashed values
"""

is_tuple = False
if isinstance(vals, tuple):
vals = [vals]
is_tuple = True
elif not is_list_like(vals):
raise TypeError("must be convertible to a list-of-tuples")

if not isinstance(vals, MultiIndex):
vals = MultiIndex.from_tuples(vals)

# create a list-of-ndarrays
def get_level_values(num):
unique = vals.levels[num] # .values
labels = vals.labels[num]
filled = algos.take_1d(unique._values, labels,
fill_value=unique._na_value)
return filled

vals = [get_level_values(level)
for level in range(vals.nlevels)]

# hash the list-of-ndarrays
hashes = (hash_array(l, encoding=encoding, hash_key=hash_key)
for l in vals)
h = _combine_hash_arrays(hashes, len(vals))
if is_tuple:
h = h[0]

return h


def _hash_categorical(c, encoding, hash_key):
"""
Hash a Categorical by hashing its categories, and then mapping the codes
@@ -97,7 +180,7 @@ def _hash_categorical(c, encoding, hash_key):
"""
cat_hashed = hash_array(c.categories.values, encoding, hash_key,
categorize=False).astype(np.uint64, copy=False)
return c.rename_categories(cat_hashed).astype(np.uint64)
return c.rename_categories(cat_hashed).astype(np.uint64, copy=False)


def hash_array(vals, encoding='utf8', hash_key=None, categorize=True):
@@ -142,9 +225,10 @@ def hash_array(vals, encoding='utf8', hash_key=None, categorize=True):
# manage it.
if is_bool_array(vals):
vals = vals.astype('u8')
elif ((is_datetime64_dtype(vals) or
is_timedelta64_dtype(vals) or
is_numeric_dtype(vals)) and vals.dtype.itemsize <= 8):
elif (is_datetime64_dtype(vals) or
is_timedelta64_dtype(vals)):
vals = vals.view('i8').astype('u8', copy=False)
elif (is_numeric_dtype(vals) and vals.dtype.itemsize <= 8):
vals = vals.view('u{}'.format(vals.dtype.itemsize)).astype('u8')
else:
# With repeated values, its MUCH faster to categorize object dtypes,
@@ -156,7 +240,13 @@ def hash_array(vals, encoding='utf8', hash_key=None, categorize=True):
ordered=False, fastpath=True)
return _hash_categorical(cat, encoding, hash_key)

vals = _hash.hash_object_array(vals, hash_key, encoding)
try:
vals = _hash.hash_object_array(vals, hash_key, encoding)
except TypeError:

# we have mixed types
vals = _hash.hash_object_array(vals.astype(str).astype(object),
hash_key, encoding)

# Then, redistribute these 64-bit ints within the space of 64-bit ints
vals ^= vals >> 30
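
For readers following the hashing.py changes above: _combine_hash_arrays folds the per-column (or per-level) uint64 hash arrays into a single array with the same multiply/xor recurrence that CPython's tupleobject.c uses, so each row hashes like the tuple of its hashed values. The standalone numpy sketch below (not part of the PR) restates that combining step for illustration.

```python
import numpy as np

def combine_uint64_hashes(arrays):
    """Fold equal-length uint64 hash arrays into one, row-wise,
    mirroring the recurrence in _combine_hash_arrays above."""
    arrays = list(arrays)
    num_items = len(arrays)
    mult = np.uint64(1000003)
    out = np.full(len(arrays[0]), 0x345678, dtype=np.uint64)
    for i, a in enumerate(arrays):
        inverse_i = num_items - i          # decreasing per-position constant
        out ^= a                           # mix in this column's hashes
        out *= mult                        # multiply by the running multiplier
        mult += np.uint64(82520 + inverse_i + inverse_i)
    out += np.uint64(97531)                # final additive constant
    return out

# two "columns" of per-row hashes combined into one uint64 hash per row
h = combine_uint64_hashes([np.array([1, 2, 3], dtype='uint64'),
                           np.array([4, 5, 6], dtype='uint64')])
```
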
65 changes: 44 additions & 21 deletions pandas/tools/tests/test_hashing.py
@@ -1,8 +1,8 @@
import numpy as np
import pandas as pd

from pandas import DataFrame, Series, Index
from pandas.tools.hashing import hash_array, hash_pandas_object
from pandas import DataFrame, Series, Index, MultiIndex
from pandas.tools.hashing import hash_array, hash_tuples, hash_pandas_object
import pandas.util.testing as tm


@@ -36,6 +36,11 @@ def test_hash_array(self):
a = s.values
tm.assert_numpy_array_equal(hash_array(a), hash_array(a))

def test_hash_array_mixed(self):
for data in [np.array([3, 4, 'All']),
np.array([3, 4, 'All'], dtype=object)]:
tm.assert_numpy_array_equal(hash_array(data), hash_array(data))

def check_equal(self, obj, **kwargs):
a = hash_pandas_object(obj, **kwargs)
b = hash_pandas_object(obj, **kwargs)
@@ -53,7 +58,29 @@ def check_not_equal_with_index(self, obj):
if not isinstance(obj, Index):
a = hash_pandas_object(obj, index=True)
b = hash_pandas_object(obj, index=False)
self.assertFalse((a == b).all())
if len(obj):
self.assertFalse((a == b).all())

def test_hash_tuples(self):
tups = [(1, 'one'), (1, 'two'), (2, 'one')]
result = hash_tuples(tups)
expected = hash_pandas_object(MultiIndex.from_tuples(tups)).values
self.assert_numpy_array_equal(result, expected)

result = hash_tuples(tups[0])
self.assertEqual(result, expected[0])

def test_hash_tuples_err(self):

for val in [5, 'foo', pd.Timestamp('20130101')]:
self.assertRaises(TypeError, hash_tuples, val)

def test_multiindex_unique(self):
mi = MultiIndex.from_tuples([(118, 472), (236, 118),
(51, 204), (102, 51)])
self.assertTrue(mi.is_unique)
result = hash_pandas_object(mi)
self.assertTrue(result.is_unique)

def test_hash_pandas_object(self):

@@ -65,14 +92,27 @@ def test_hash_pandas_object(self):
Series(['a', np.nan, 'c']),
Series(['a', None, 'c']),
Series([True, False, True]),
Series(),
Index([1, 2, 3]),
Index([True, False, True]),
DataFrame({'x': ['a', 'b', 'c'], 'y': [1, 2, 3]}),
DataFrame(),
tm.makeMissingDataframe(),
tm.makeMixedDataFrame(),
tm.makeTimeDataFrame(),
tm.makeTimeSeries(),
tm.makeTimedeltaIndex()]:
tm.makeTimedeltaIndex(),
tm.makePeriodIndex(),
Series(tm.makePeriodIndex()),
Series(pd.date_range('20130101',
periods=3, tz='US/Eastern')),
MultiIndex.from_product(
[range(5),
['foo', 'bar', 'baz'],
pd.date_range('20130101', periods=2)]),
MultiIndex.from_product(
[pd.CategoricalIndex(list('aabc')),
range(3)])]:
self.check_equal(obj)
self.check_not_equal_with_index(obj)

@@ -131,23 +171,6 @@ def f():
hash_pandas_object(Series(list('abc')), hash_key='foo')
self.assertRaises(ValueError, f)

def test_unsupported_objects(self):

# mixed objects are not supported
obj = Series(['1', 2, 3])

def f():
hash_pandas_object(obj)
self.assertRaises(TypeError, f)

# MultiIndex are represented as tuples
obj = Series([1, 2, 3], index=pd.MultiIndex.from_tuples(
[('a', 1), ('a', 2), ('b', 1)]))

def f():
hash_pandas_object(obj)
self.assertRaises(TypeError, f)

def test_alread_encoded(self):
# if already encoded then ok

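The new test_hash_array_mixed above exercises the try/except fallback added to hash_array: object arrays that mix numbers and strings used to raise TypeError (the removed test_unsupported_objects) and are now hashed by falling back to the string representation of each element. A small sketch of that behaviour, again assuming the 0.20.0-era pandas.tools.hashing import path:

```python
import numpy as np
from pandas.tools.hashing import hash_array

mixed = np.array([3, 4, 'All'], dtype=object)

# with the fallback, mixed object values are stringified before hashing,
# so the call succeeds and is deterministic across invocations
h1 = hash_array(mixed)
h2 = hash_array(mixed)
assert (h1 == h2).all()
```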