Skip to content

Commit f1aea13

Browse files
committed
Address comments
- Add `categorize` parameter to `hash_pandas_object`
- Update test
- Update whatsnew
1 parent 7878c55 commit f1aea13

File tree

3 files changed

+45
-29
lines changed

3 files changed

+45
-29
lines changed

doc/source/whatsnew/v0.20.0.txt

+1-1
Original file line number | Diff line number | Diff line change
@@ -309,7 +309,7 @@ Bug Fixes
309309
- Bug in ``pd.read_csv()`` in which the ``dialect`` parameter was not being verified before processing (:issue:`14898`)
310310
- Bug in ``pd.read_fwf`` where the skiprows parameter was not being respected during column width inference (:issue:`11256`)
311311
- Bug in ``pd.read_csv()`` in which missing data was being improperly handled with ``usecols`` (:issue:`6710`)
312-
- Bug in ``pandas.tools.hashing.hash_pandas_object`` in which hashing of categoricals depended on the ordering of categories, instead of just their values.
312+
- Bug in ``pd.tools.hashing.hash_pandas_object()`` in which hashing of categoricals depended on the ordering of categories, instead of just their values (:issue:`15143`)
313313

314314
- Bug in ``DataFrame.loc`` with indexing a ``MultiIndex`` with a ``Series`` indexer (:issue:`14730`)
315315

pandas/tools/hashing.py

+32-18
Original file line number | Diff line number | Diff line change
@@ -12,7 +12,8 @@
1212
_default_hash_key = '0123456789123456'
1313

1414

15-
def hash_pandas_object(obj, index=True, encoding='utf8', hash_key=None):
15+
def hash_pandas_object(obj, index=True, encoding='utf8', hash_key=None,
16+
categorize=True):
1617
"""
1718
Return a data hash of the Index/Series/DataFrame
1819
@@ -25,6 +26,11 @@ def hash_pandas_object(obj, index=True, encoding='utf8', hash_key=None):
2526
encoding : string, default 'utf8'
2627
encoding for data & key when strings
2728
hash_key : string key to encode, default to _default_hash_key
29+
categorize : bool, default True
30+
Whether to first categorize object arrays before hashing. This is more
31+
efficient when the array contains duplicate values.
32+
33+
.. versionadded:: 0.20.0
2834
2935
Returns
3036
-------
@@ -39,35 +45,48 @@ def adder(h, hashed_to_add):
3945
return np.add(h, hashed_to_add, h)
4046

4147
if isinstance(obj, ABCIndexClass):
42-
h = hash_array(obj.values, encoding, hash_key).astype('uint64')
48+
h = hash_array(obj.values, encoding, hash_key,
49+
categorize).astype('uint64')
4350
h = Series(h, index=obj, dtype='uint64')
4451
elif isinstance(obj, ABCSeries):
45-
h = hash_array(obj.values, encoding, hash_key).astype('uint64')
52+
h = hash_array(obj.values, encoding, hash_key,
53+
categorize).astype('uint64')
4654
if index:
4755
h = adder(h, hash_pandas_object(obj.index,
4856
index=False,
4957
encoding=encoding,
50-
hash_key=hash_key).values)
58+
hash_key=hash_key,
59+
categorize=categorize).values)
5160
h = Series(h, index=obj.index, dtype='uint64')
5261
elif isinstance(obj, ABCDataFrame):
5362
cols = obj.iteritems()
5463
first_series = next(cols)[1]
5564
h = hash_array(first_series.values, encoding,
56-
hash_key).astype('uint64')
65+
hash_key, categorize).astype('uint64')
5766
for _, col in cols:
58-
h = adder(h, hash_array(col.values, encoding, hash_key))
67+
h = adder(h, hash_array(col.values, encoding, hash_key,
68+
categorize))
5969
if index:
6070
h = adder(h, hash_pandas_object(obj.index,
6171
index=False,
6272
encoding=encoding,
63-
hash_key=hash_key).values)
73+
hash_key=hash_key,
74+
categorize=categorize).values)
6475

6576
h = Series(h, index=obj.index, dtype='uint64')
6677
else:
6778
raise TypeError("Unexpected type for hashing %s" % type(obj))
6879
return h
6980

7081

82+
def _hash_categorical(c, encoding, hash_key):
83+
"""Hash a Categorical by hashing its categories, and then mapping the codes
84+
to the hashes"""
85+
cat_hashed = hash_array(c.categories.values, encoding, hash_key,
86+
categorize=False).astype(np.uint64, copy=False)
87+
return c.rename_categories(cat_hashed).astype(np.uint64)
88+
89+
7190
def hash_array(vals, encoding='utf8', hash_key=None, categorize=True):
7291
"""
7392
Given a 1d array, return an array of deterministic integers.
@@ -84,6 +103,8 @@ def hash_array(vals, encoding='utf8', hash_key=None, categorize=True):
84103
Whether to first categorize object arrays before hashing. This is more
85104
efficient when the array contains duplicate values.
86105
106+
.. versionadded:: 0.20.0
107+
87108
Returns
88109
-------
89110
1d uint64 numpy array of hash values, same length as the vals
@@ -97,12 +118,7 @@ def hash_array(vals, encoding='utf8', hash_key=None, categorize=True):
97118
# hash values. (This check is above the complex check so that we don't ask
98119
# numpy if categorical is a subdtype of complex, as it will choke.)
99120
if is_categorical_dtype(vals.dtype):
100-
cat_hashed = hash_array(vals.categories.values, encoding, hash_key,
101-
categorize=False).astype(np.uint64, copy=False)
102-
# Since `cat_hashed` is already distributed in the space of uint64s,
103-
# we can just return after remapping the codes here
104-
c = Series(vals)
105-
return c.cat.rename_categories(cat_hashed).values.astype(np.uint64)
121+
return _hash_categorical(vals, encoding, hash_key)
106122

107123
# we'll be working with everything as 64-bit values, so handle this
108124
# 128-bit value early
@@ -123,11 +139,9 @@ def hash_array(vals, encoding='utf8', hash_key=None, categorize=True):
123139
# when the values are known/likely to be unique.
124140
if categorize:
125141
codes, categories = factorize(vals, sort=False)
126-
c = Series(Categorical(codes, Index(categories),
127-
ordered=False, fastpath=True))
128-
vals = _hash.hash_object_array(categories, hash_key, encoding)
129-
# rename & extract
130-
vals = c.cat.rename_categories(vals).values.astype(np.uint64)
142+
cat = Categorical(codes, Index(categories),
143+
ordered=False, fastpath=True)
144+
return _hash_categorical(cat, encoding, hash_key)
131145
else:
132146
vals = _hash.hash_object_array(vals, hash_key, encoding)
133147

pandas/tools/tests/test_hashing.py

+12-10
Original file line number | Diff line number | Diff line change
@@ -93,16 +93,18 @@ def test_hash_pandas_empty_object(self):
9393
def test_categorical_consistency(self):
9494
# Check that categoricals hash consistent with their values, not codes
9595
# This should work for categoricals of any dtype
96-
for data in [['a', 'b', 'c', 'd'], [1000, 2000, 3000, 4000]]:
97-
s1 = Series(data)
98-
s2 = s1.astype('category').cat.set_categories(data)
99-
s3 = s2.cat.set_categories(list(reversed(data)))
100-
# These should all hash identically
101-
h1 = hash_pandas_object(s1)
102-
h2 = hash_pandas_object(s2)
103-
h3 = hash_pandas_object(s3)
104-
tm.assert_series_equal(h1, h2)
105-
tm.assert_series_equal(h1, h3)
96+
for s1 in [Series(['a', 'b', 'c', 'd']),
97+
Series([1000, 2000, 3000, 4000]),
98+
Series(pd.date_range(0, periods=4))]:
99+
s2 = s1.astype('category').cat.set_categories(s1)
100+
s3 = s2.cat.set_categories(list(reversed(s1)))
101+
for categorize in [True, False]:
102+
# These should all hash identically
103+
h1 = hash_pandas_object(s1, categorize=categorize)
104+
h2 = hash_pandas_object(s2, categorize=categorize)
105+
h3 = hash_pandas_object(s3, categorize=categorize)
106+
tm.assert_series_equal(h1, h2)
107+
tm.assert_series_equal(h1, h3)
106108

107109
def test_errors(self):
108110

0 commit comments

Comments
 (0)