Skip to content

Commit 8b1d3f9

Browse files
committed
Fix: categorical levels in a MultiIndex (MI) were not being hashed correctly
1 parent 48a2402 commit 8b1d3f9

File tree

2 files changed

+16
-36
lines changed

2 files changed

+16
-36
lines changed

pandas/tools/hashing.py

+8-35
Original file line numberDiff line numberDiff line change
@@ -115,36 +115,6 @@ def hash_pandas_object(obj, index=True, encoding='utf8', hash_key=None,
115115
return h
116116

117117

118-
def _hash_lists(vals, encoding='utf8', hash_key=None):
119-
"""
120-
121-
Parameters
122-
----------
123-
vals : list of ndarrays
124-
encoding : string, default 'utf8'
125-
encoding for data & key when strings
126-
hash_key : string key to encode, default to _default_hash_key
127-
128-
Returns
129-
-------
130-
1d uint64 numpy array of hash values, same length as the vals[0]
131-
"""
132-
133-
if not isinstance(vals, list):
134-
raise TypeError("only can accept lists")
135-
136-
if not len(vals):
137-
raise ValueError("must pass a non-zero length vals")
138-
139-
if not isinstance(vals[0], np.ndarray):
140-
raise ValueError("must pass a ndarray")
141-
142-
hashes = (hash_array(l, encoding=encoding, hash_key=hash_key)
143-
for l in vals)
144-
h = _combine_hash_arrays(hashes, len(vals))
145-
return h
146-
147-
148118
def hash_tuples(vals, encoding='utf8', hash_key=None):
149119
"""
150120
Hash a MultiIndex / list-of-tuples efficiently
@@ -172,22 +142,25 @@ def hash_tuples(vals, encoding='utf8', hash_key=None):
172142
if not isinstance(vals, MultiIndex):
173143
vals = MultiIndex.from_tuples(vals)
174144

175-
# create a list-of-ndarrays & hash
145+
# create a list-of-ndarrays
176146
def get_level_values(num):
177147
unique = vals.levels[num] # .values
178148
labels = vals.labels[num]
179-
filled = algos.take_1d(unique.values, labels,
149+
filled = algos.take_1d(unique._values, labels,
180150
fill_value=unique._na_value)
181151
return filled
182152

183153
vals = [get_level_values(level)
184154
for level in range(vals.nlevels)]
185155

186-
result = _hash_lists(vals, encoding=encoding, hash_key=hash_key)
156+
# hash the list-of-ndarrays
157+
hashes = (hash_array(l, encoding=encoding, hash_key=hash_key)
158+
for l in vals)
159+
h = _combine_hash_arrays(hashes, len(vals))
187160
if is_tuple:
188-
result = result[0]
161+
h = h[0]
189162

190-
return result
163+
return h
191164

192165

193166
def _hash_categorical(c, encoding, hash_key):

pandas/tools/tests/test_hashing.py

+8-1
Original file line numberDiff line numberDiff line change
@@ -102,10 +102,17 @@ def test_hash_pandas_object(self):
102102
tm.makeTimeDataFrame(),
103103
tm.makeTimeSeries(),
104104
tm.makeTimedeltaIndex(),
105+
tm.makePeriodIndex(),
106+
Series(tm.makePeriodIndex()),
107+
Series(pd.date_range('20130101',
108+
periods=3, tz='US/Eastern')),
105109
MultiIndex.from_product(
106110
[range(5),
107111
['foo', 'bar', 'baz'],
108-
pd.date_range('20130101', periods=2)])]:
112+
pd.date_range('20130101', periods=2)]),
113+
MultiIndex.from_product(
114+
[pd.CategoricalIndex(list('aabc')),
115+
range(3)])]:
109116
self.check_equal(obj)
110117
self.check_not_equal_with_index(obj)
111118

0 commit comments

Comments
 (0)