Skip to content

Commit d096b06

Browse files
jcrist authored and AnkurDedania committed
BUG: Categoricals hash consistently
Previously categorical values were hashed using just their codes. This meant that the hash value depended on the ordering of the categories, rather than on the values the series represented. This caused problems in dask, where different partitions might have different categorical mappings. This PR makes the hashing dependent on the values the categorical represents, rather than on the codes. The categories are first hashed, and then the codes are remapped to the hashed values. This is slightly slower than before (still need to hash the categories, where we didn't before), but allows for more consistent hashing. Related to this work in dask: dask/dask#1877. Author: Jim Crist <[email protected]> Closes pandas-dev#15143 from jcrist/categories_hash_consistently and squashes the following commits: f1aea13 [Jim Crist] Address comments 7878c55 [Jim Crist] Categoricals hash consistently
1 parent 8b20316 commit d096b06

File tree

3 files changed

+83
-35
lines changed

3 files changed

+83
-35
lines changed

doc/source/whatsnew/v0.20.0.txt

+1
Original file line numberDiff line numberDiff line change
@@ -326,6 +326,7 @@ Bug Fixes
326326
- Bug in ``pd.read_csv()`` in which the ``dialect`` parameter was not being verified before processing (:issue:`14898`)
327327
- Bug in ``pd.read_fwf`` where the skiprows parameter was not being respected during column width inference (:issue:`11256`)
328328
- Bug in ``pd.read_csv()`` in which missing data was being improperly handled with ``usecols`` (:issue:`6710`)
329+
- Bug in ``pd.tools.hashing.hash_pandas_object()`` in which hashing of categoricals depended on the ordering of categories, instead of just their values (:issue:`15143`)
329330

330331
- Bug in ``DataFrame.loc`` with indexing a ``MultiIndex`` with a ``Series`` indexer (:issue:`14730`)
331332

pandas/tools/hashing.py

+65-35
Original file line numberDiff line numberDiff line change
@@ -4,15 +4,17 @@
44

55
import numpy as np
66
from pandas import _hash, Series, factorize, Categorical, Index
7-
from pandas.lib import infer_dtype
7+
from pandas.lib import is_bool_array
88
from pandas.types.generic import ABCIndexClass, ABCSeries, ABCDataFrame
9-
from pandas.types.common import is_categorical_dtype
9+
from pandas.types.common import (is_categorical_dtype, is_numeric_dtype,
10+
is_datetime64_dtype, is_timedelta64_dtype)
1011

1112
# 16 byte long hashing key
1213
_default_hash_key = '0123456789123456'
1314

1415

15-
def hash_pandas_object(obj, index=True, encoding='utf8', hash_key=None):
16+
def hash_pandas_object(obj, index=True, encoding='utf8', hash_key=None,
17+
categorize=True):
1618
"""
1719
Return a data hash of the Index/Series/DataFrame
1820
@@ -25,6 +27,11 @@ def hash_pandas_object(obj, index=True, encoding='utf8', hash_key=None):
2527
encoding : string, default 'utf8'
2628
encoding for data & key when strings
2729
hash_key : string key to encode, default to _default_hash_key
30+
categorize : bool, default True
31+
Whether to first categorize object arrays before hashing. This is more
32+
efficient when the array contains duplicate values.
33+
34+
.. versionadded:: 0.20.0
2835
2936
Returns
3037
-------
@@ -39,36 +46,61 @@ def adder(h, hashed_to_add):
3946
return np.add(h, hashed_to_add, h)
4047

4148
if isinstance(obj, ABCIndexClass):
42-
h = hash_array(obj.values, encoding, hash_key).astype('uint64')
49+
h = hash_array(obj.values, encoding, hash_key,
50+
categorize).astype('uint64')
4351
h = Series(h, index=obj, dtype='uint64')
4452
elif isinstance(obj, ABCSeries):
45-
h = hash_array(obj.values, encoding, hash_key).astype('uint64')
53+
h = hash_array(obj.values, encoding, hash_key,
54+
categorize).astype('uint64')
4655
if index:
4756
h = adder(h, hash_pandas_object(obj.index,
4857
index=False,
4958
encoding=encoding,
50-
hash_key=hash_key).values)
59+
hash_key=hash_key,
60+
categorize=categorize).values)
5161
h = Series(h, index=obj.index, dtype='uint64')
5262
elif isinstance(obj, ABCDataFrame):
5363
cols = obj.iteritems()
5464
first_series = next(cols)[1]
5565
h = hash_array(first_series.values, encoding,
56-
hash_key).astype('uint64')
66+
hash_key, categorize).astype('uint64')
5767
for _, col in cols:
58-
h = adder(h, hash_array(col.values, encoding, hash_key))
68+
h = adder(h, hash_array(col.values, encoding, hash_key,
69+
categorize))
5970
if index:
6071
h = adder(h, hash_pandas_object(obj.index,
6172
index=False,
6273
encoding=encoding,
63-
hash_key=hash_key).values)
74+
hash_key=hash_key,
75+
categorize=categorize).values)
6476

6577
h = Series(h, index=obj.index, dtype='uint64')
6678
else:
6779
raise TypeError("Unexpected type for hashing %s" % type(obj))
6880
return h
6981

7082

71-
def hash_array(vals, encoding='utf8', hash_key=None):
83+
def _hash_categorical(c, encoding, hash_key):
84+
"""
85+
Hash a Categorical by hashing its categories, and then mapping the codes
86+
to the hashes
87+
88+
Parameters
89+
----------
90+
c : Categorical
91+
encoding : string, default 'utf8'
92+
hash_key : string key to encode, default to _default_hash_key
93+
94+
Returns
95+
-------
96+
ndarray of hashed values, same length as c
97+
"""
98+
cat_hashed = hash_array(c.categories.values, encoding, hash_key,
99+
categorize=False).astype(np.uint64, copy=False)
100+
return c.rename_categories(cat_hashed).astype(np.uint64)
101+
102+
103+
def hash_array(vals, encoding='utf8', hash_key=None, categorize=True):
72104
"""
73105
Given a 1d array, return an array of deterministic integers.
74106
@@ -80,53 +112,51 @@ def hash_array(vals, encoding='utf8', hash_key=None):
80112
encoding : string, default 'utf8'
81113
encoding for data & key when strings
82114
hash_key : string key to encode, default to _default_hash_key
115+
categorize : bool, default True
116+
Whether to first categorize object arrays before hashing. This is more
117+
efficient when the array contains duplicate values.
118+
119+
.. versionadded:: 0.20.0
83120
84121
Returns
85122
-------
86123
1d uint64 numpy array of hash values, same length as the vals
87124
88125
"""
89126

90-
# work with cagegoricals as ints. (This check is above the complex
91-
# check so that we don't ask numpy if categorical is a subdtype of
92-
# complex, as it will choke.
93127
if hash_key is None:
94128
hash_key = _default_hash_key
95129

130+
# For categoricals, we hash the categories, then remap the codes to the
131+
# hash values. (This check is above the complex check so that we don't ask
132+
# numpy if categorical is a subdtype of complex, as it will choke.)
96133
if is_categorical_dtype(vals.dtype):
97-
vals = vals.codes
134+
return _hash_categorical(vals, encoding, hash_key)
98135

99136
# we'll be working with everything as 64-bit values, so handle this
100137
# 128-bit value early
101138
if np.issubdtype(vals.dtype, np.complex128):
102139
return hash_array(vals.real) + 23 * hash_array(vals.imag)
103140

104-
# MAIN LOGIC:
105-
inferred = infer_dtype(vals)
106-
107141
# First, turn whatever array this is into unsigned 64-bit ints, if we can
108142
# manage it.
109-
if inferred == 'boolean':
143+
if is_bool_array(vals):
110144
vals = vals.astype('u8')
111-
112-
if (np.issubdtype(vals.dtype, np.datetime64) or
113-
np.issubdtype(vals.dtype, np.timedelta64) or
114-
np.issubdtype(vals.dtype, np.number)) and vals.dtype.itemsize <= 8:
115-
145+
elif ((is_datetime64_dtype(vals) or
146+
is_timedelta64_dtype(vals) or
147+
is_numeric_dtype(vals)) and vals.dtype.itemsize <= 8):
116148
vals = vals.view('u{}'.format(vals.dtype.itemsize)).astype('u8')
117149
else:
118-
119-
# its MUCH faster to categorize object dtypes, then hash and rename
120-
codes, categories = factorize(vals, sort=False)
121-
categories = Index(categories)
122-
c = Series(Categorical(codes, categories,
123-
ordered=False, fastpath=True))
124-
vals = _hash.hash_object_array(categories.values,
125-
hash_key,
126-
encoding)
127-
128-
# rename & extract
129-
vals = c.cat.rename_categories(Index(vals)).astype(np.uint64).values
150+
# With repeated values, it's MUCH faster to categorize object dtypes,
151+
# then hash and rename categories. We allow skipping the categorization
152+
# when the values are known/likely to be unique.
153+
if categorize:
154+
codes, categories = factorize(vals, sort=False)
155+
cat = Categorical(codes, Index(categories),
156+
ordered=False, fastpath=True)
157+
return _hash_categorical(cat, encoding, hash_key)
158+
159+
vals = _hash.hash_object_array(vals, hash_key, encoding)
130160

131161
# Then, redistribute these 64-bit ints within the space of 64-bit ints
132162
vals ^= vals >> 30

pandas/tools/tests/test_hashing.py

+17
Original file line numberDiff line numberDiff line change
@@ -90,6 +90,23 @@ def test_hash_pandas_empty_object(self):
9090
# these are by-definition the same with
9191
# or w/o the index as the data is empty
9292

93+
def test_categorical_consistency(self):
94+
# GH15143
95+
# Check that categoricals hash consistently with their values, not codes
96+
# This should work for categoricals of any dtype
97+
for s1 in [Series(['a', 'b', 'c', 'd']),
98+
Series([1000, 2000, 3000, 4000]),
99+
Series(pd.date_range(0, periods=4))]:
100+
s2 = s1.astype('category').cat.set_categories(s1)
101+
s3 = s2.cat.set_categories(list(reversed(s1)))
102+
for categorize in [True, False]:
103+
# These should all hash identically
104+
h1 = hash_pandas_object(s1, categorize=categorize)
105+
h2 = hash_pandas_object(s2, categorize=categorize)
106+
h3 = hash_pandas_object(s3, categorize=categorize)
107+
tm.assert_series_equal(h1, h2)
108+
tm.assert_series_equal(h1, h3)
109+
93110
def test_errors(self):
94111

95112
for obj in [pd.Timestamp('20130101'), tm.makePanel()]:

0 commit comments

Comments
 (0)