|
3 | 3 | """
|
4 | 4 |
|
5 | 5 | import numpy as np
|
6 |
| -from pandas import _hash, Series, factorize, Categorical, Index |
| 6 | +from pandas import _hash, Series, factorize, Categorical, Index, MultiIndex |
7 | 7 | from pandas.lib import is_bool_array
|
8 | 8 | from pandas.types.generic import ABCIndexClass, ABCSeries, ABCDataFrame
|
9 | 9 | from pandas.types.common import (is_categorical_dtype, is_numeric_dtype,
|
10 |
| - is_datetime64_dtype, is_timedelta64_dtype) |
| 10 | + is_datetime64_dtype, is_timedelta64_dtype, |
| 11 | + is_object_dtype) |
11 | 12 |
|
12 | 13 | # 16 byte long hashing key
|
13 | 14 | _default_hash_key = '0123456789123456'
|
@@ -45,6 +46,9 @@ def adder(h, hashed_to_add):
|
45 | 46 | h = np.multiply(h, np.uint(3), h)
|
46 | 47 | return np.add(h, hashed_to_add, h)
|
47 | 48 |
|
| 49 | + if isinstance(obj, MultiIndex): |
| 50 | + return _hash_tuples(obj, encoding, hash_key) |
| 51 | + |
48 | 52 | if isinstance(obj, ABCIndexClass):
|
49 | 53 | h = hash_array(obj.values, encoding, hash_key,
|
50 | 54 | categorize).astype('uint64')
|
@@ -80,6 +84,30 @@ def adder(h, hashed_to_add):
|
80 | 84 | return h
|
81 | 85 |
|
82 | 86 |
|
| 87 | +def _hash_tuples(vals, encoding, hash_key): |
| 88 | + """ |
| 89 | + Hash a MultiIndex / array_of_tuples efficiently |
| 90 | +
|
| 91 | + Parameters |
| 92 | + ---------- |
| 93 | + vals : MultiIndex or ndarray of tuples |
| 94 | + encoding : string, default 'utf8' |
| 95 | + hash_key : string key to encode, defaults to _default_hash_key |
| 96 | +
|
| 97 | + Returns |
| 98 | + ------- |
| 99 | + ndarray of hashed values, same size as len(vals) |
| 100 | + """ |
| 101 | + |
| 102 | + if not isinstance(vals, MultiIndex): |
| 103 | + vals = MultiIndex.from_tuples(vals) |
| 104 | + |
| 105 | + # efficiently turn us into a DataFrame and hash |
| 106 | + return hash_pandas_object(vals.to_dataframe(index=False), |
| 107 | + index=False, encoding=encoding, |
| 108 | + hash_key=hash_key, categorize=False) |
| 109 | + |
| 110 | + |
83 | 111 | def _hash_categorical(c, encoding, hash_key):
|
84 | 112 | """
|
85 | 113 | Hash a Categorical by hashing its categories, and then mapping the codes
|
@@ -127,6 +155,10 @@ def hash_array(vals, encoding='utf8', hash_key=None, categorize=True):
|
127 | 155 | if hash_key is None:
|
128 | 156 | hash_key = _default_hash_key
|
129 | 157 |
|
| 158 | + if isinstance(vals, list) and len(vals) and isinstance(vals[0], tuple): |
| 159 | + # we hash a list of tuples similar to a MultiIndex |
| 160 | + return _hash_tuples(vals, encoding, hash_key).values |
| 161 | + |
130 | 162 | # For categoricals, we hash the categories, then remap the codes to the
|
131 | 163 | # hash values. (This check is above the complex check so that we don't ask
|
132 | 164 | # numpy if categorical is a subdtype of complex, as it will choke.
|
|
0 commit comments