Skip to content

Commit fb58008

Browse files
committed
add hash_tuples
1 parent 4245023 commit fb58008

File tree

3 files changed

+74
-24
lines changed

3 files changed

+74
-24
lines changed

doc/source/whatsnew/v0.20.0.txt

+1
Original file line numberDiff line numberDiff line change
@@ -145,6 +145,7 @@ Other enhancements
145145
- ``pd.merge_asof()`` gained the option ``direction='backward'|'forward'|'nearest'`` (:issue:`14887`)
146146
- ``Series/DataFrame.asfreq()`` have gained a ``fill_value`` parameter, to fill missing values (:issue:`3715`).
147147
- ``Series/DataFrame.resample.asfreq`` have gained a ``fill_value`` parameter, to fill missing values during resampling (:issue:`3715`).
148+
- ``pandas.tools.hashing`` has gained a ``hash_tuples`` routine, and ``hash_pandas_object`` has gained the ability to hash a ``MultiIndex`` (:issue:`15224`)
148149

149150
.. _ISO 8601 duration: https://en.wikipedia.org/wiki/ISO_8601#Durations
150151

pandas/tools/hashing.py

+67-21
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55

66
import numpy as np
77
from pandas import _hash, Series, factorize, Categorical, Index, MultiIndex
8+
import pandas.core.algorithms as algos
89
from pandas.lib import is_bool_array
910
from pandas.types.generic import ABCIndexClass, ABCSeries, ABCDataFrame
1011
from pandas.types.common import (is_categorical_dtype, is_numeric_dtype,
@@ -58,15 +59,16 @@ def hash_pandas_object(obj, index=True, encoding='utf8', hash_key=None,
5859
hash_key = _default_hash_key
5960

6061
if isinstance(obj, MultiIndex):
61-
return _hash_tuples(obj, encoding, hash_key)
62+
return Series(hash_tuples(obj, encoding, hash_key),
63+
dtype='uint64', copy=False)
6264

6365
if isinstance(obj, ABCIndexClass):
6466
h = hash_array(obj.values, encoding, hash_key,
65-
categorize).astype('uint64')
66-
h = Series(h, index=obj, dtype='uint64')
67+
categorize).astype('uint64', copy=False)
68+
h = Series(h, index=obj, dtype='uint64', copy=False)
6769
elif isinstance(obj, ABCSeries):
6870
h = hash_array(obj.values, encoding, hash_key,
69-
categorize).astype('uint64')
71+
categorize).astype('uint64', copy=False)
7072
if index:
7173
h = _combine_hash_arrays(iter([
7274
h,
@@ -76,7 +78,7 @@ def hash_pandas_object(obj, index=True, encoding='utf8', hash_key=None,
7678
hash_key=hash_key,
7779
categorize=categorize).values]),
7880
2)
79-
h = Series(h, index=obj.index, dtype='uint64')
81+
h = Series(h, index=obj.index, dtype='uint64', copy=False)
8082
elif isinstance(obj, ABCDataFrame):
8183
hashes = (hash_array(series.values) for _, series in obj.iteritems())
8284
num_items = len(obj.columns)
@@ -91,34 +93,81 @@ def hash_pandas_object(obj, index=True, encoding='utf8', hash_key=None,
9193
hashes = itertools.chain(hashes, index_hash_generator)
9294
h = _combine_hash_arrays(hashes, num_items)
9395

94-
h = Series(h, index=obj.index, dtype='uint64')
96+
h = Series(h, index=obj.index, dtype='uint64', copy=False)
9597
else:
9698
raise TypeError("Unexpected type for hashing %s" % type(obj))
9799
return h
98100

99101

100-
def _hash_tuples(vals, encoding, hash_key):
102+
def _hash_lists(vals, encoding='utf8', hash_key=None):
103+
"""
104+
105+
Parameters
106+
----------
107+
vals : list of ndarrays
108+
encoding : string, default 'utf8'
109+
encoding for data & key when strings
110+
hash_key : string key to encode, default to _default_hash_key
111+
112+
Returns
113+
-------
114+
1d uint64 numpy array of hash values, same length as vals[0]
115+
"""
116+
117+
if not isinstance(vals, list):
118+
raise TypeError("only can accept lists")
119+
120+
if not len(vals):
121+
raise ValueError("must pass a non-zero length vals")
122+
123+
if not isinstance(vals[0], np.ndarray):
124+
raise ValueError("must pass a ndarray")
125+
126+
hashes = (hash_array(l, encoding=encoding, hash_key=hash_key)
127+
for l in vals)
128+
h = _combine_hash_arrays(hashes, len(vals))
129+
return h
130+
131+
132+
def hash_tuples(vals, encoding='utf8', hash_key=None):
101133
"""
102134
Hash a MultiIndex / array_of_tuples efficiently
103135
104136
Parameters
105137
----------
106-
vals : MultiIndex or ndarray of tuples
138+
vals : MultiIndex, ndarray of tuples, or single tuple
107139
encoding : string, default 'utf8'
108140
hash_key : string key to encode, default to _default_hash_key
109141
110142
Returns
111143
-------
112-
ndarray of hashed values array, same size as len(c)
144+
ndarray of hashed values
113145
"""
114146

147+
is_tuple = False
148+
if isinstance(vals, tuple):
149+
vals = [vals]
150+
is_tuple = True
151+
115152
if not isinstance(vals, MultiIndex):
116153
vals = MultiIndex.from_tuples(vals)
117154

118-
# efficiently turn us into a DataFrame and hash
119-
return hash_pandas_object(vals.to_frame(index=False),
120-
index=False, encoding=encoding,
121-
hash_key=hash_key, categorize=False)
155+
# create a list-of-ndarrays & hash
156+
def get_level_values(num):
157+
unique = vals.levels[num] # .values
158+
labels = vals.labels[num]
159+
filled = algos.take_1d(unique.values, labels,
160+
fill_value=unique._na_value)
161+
return filled
162+
163+
vals = [get_level_values(level)
164+
for level in range(vals.nlevels)]
165+
166+
result = _hash_lists(vals, encoding=encoding, hash_key=hash_key)
167+
if is_tuple:
168+
result = result[0]
169+
170+
return result
122171

123172

124173
def _hash_categorical(c, encoding, hash_key):
@@ -138,7 +187,7 @@ def _hash_categorical(c, encoding, hash_key):
138187
"""
139188
cat_hashed = hash_array(c.categories.values, encoding, hash_key,
140189
categorize=False).astype(np.uint64, copy=False)
141-
return c.rename_categories(cat_hashed).astype(np.uint64)
190+
return c.rename_categories(cat_hashed).astype(np.uint64, copy=False)
142191

143192

144193
def hash_array(vals, encoding='utf8', hash_key=None, categorize=True):
@@ -168,10 +217,6 @@ def hash_array(vals, encoding='utf8', hash_key=None, categorize=True):
168217
if hash_key is None:
169218
hash_key = _default_hash_key
170219

171-
if isinstance(vals, list) and len(vals) and isinstance(vals[0], tuple):
172-
# we hash a list of tuples similar to a MultiIndex
173-
return _hash_tuples(vals, encoding, hash_key).values
174-
175220
# For categoricals, we hash the categories, then remap the codes to the
176221
# hash values. (This check is above the complex check so that we don't ask
177222
# numpy if categorical is a subdtype of complex, as it will choke.
@@ -187,9 +232,10 @@ def hash_array(vals, encoding='utf8', hash_key=None, categorize=True):
187232
# manage it.
188233
if is_bool_array(vals):
189234
vals = vals.astype('u8')
190-
elif ((is_datetime64_dtype(vals) or
191-
is_timedelta64_dtype(vals) or
192-
is_numeric_dtype(vals)) and vals.dtype.itemsize <= 8):
235+
elif (is_datetime64_dtype(vals) or
236+
is_timedelta64_dtype(vals)):
237+
vals = vals.view('i8').astype('u8', copy=False)
238+
elif (is_numeric_dtype(vals) and vals.dtype.itemsize <= 8):
193239
vals = vals.view('u{}'.format(vals.dtype.itemsize)).astype('u8')
194240
else:
195241
# With repeated values, it's MUCH faster to categorize object dtypes,

pandas/tools/tests/test_hashing.py

+6-3
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
import pandas as pd
33

44
from pandas import DataFrame, Series, Index, MultiIndex
5-
from pandas.tools.hashing import hash_array, hash_pandas_object
5+
from pandas.tools.hashing import hash_array, hash_tuples, hash_pandas_object
66
import pandas.util.testing as tm
77

88

@@ -55,12 +55,15 @@ def check_not_equal_with_index(self, obj):
5555
b = hash_pandas_object(obj, index=False)
5656
self.assertFalse((a == b).all())
5757

58-
def test_hash_list_tuples(self):
58+
def test_hash_tuples(self):
5959
tups = [(1, 'one'), (1, 'two'), (2, 'one')]
60-
result = hash_array(tups)
60+
result = hash_tuples(tups)
6161
expected = hash_pandas_object(MultiIndex.from_tuples(tups)).values
6262
self.assert_numpy_array_equal(result, expected)
6363

64+
result = hash_tuples(tups[0])
65+
self.assertEqual(result, expected[0])
66+
6467
def test_multiindex_unique(self):
6568
mi = MultiIndex.from_tuples([(118, 472), (236, 118),
6669
(51, 204), (102, 51)])

0 commit comments

Comments
 (0)