Skip to content

Commit 7878c55

Browse files
committed
Categoricals hash consistently
Previously categorical values were hashed using just their codes. This meant that the hash value depended on the ordering of the categories, rather than on the values the series represented. This caused problems in dask, where different partitions might have different categorical mappings. This PR makes the hashing dependent on the values the categorical represents, rather than on the codes. The categories are first hashed, and then the codes are remapped to the hashed values. This is slightly slower than before (we still need to hash the categories, which we didn't do before), but allows for more consistent hashing.
1 parent 0e219d7 commit 7878c55

File tree

3 files changed

+46
-28
lines changed

3 files changed

+46
-28
lines changed

doc/source/whatsnew/v0.20.0.txt

+2-1
Original file line numberDiff line numberDiff line change
@@ -309,6 +309,7 @@ Bug Fixes
309309
- Bug in ``pd.read_csv()`` in which the ``dialect`` parameter was not being verified before processing (:issue:`14898`)
310310
- Bug in ``pd.read_fwf`` where the skiprows parameter was not being respected during column width inference (:issue:`11256`)
311311
- Bug in ``pd.read_csv()`` in which missing data was being improperly handled with ``usecols`` (:issue:`6710`)
312+
- Bug in ``pandas.tools.hashing.hash_pandas_object`` in which hashing of categoricals depended on the ordering of categories, instead of just their values.
312313

313314
- Bug in ``DataFrame.loc`` with indexing a ``MultiIndex`` with a ``Series`` indexer (:issue:`14730`)
314315

@@ -369,4 +370,4 @@ Bug Fixes
369370
- Bug in ``Series`` constructor when both ``copy=True`` and ``dtype`` arguments are provided (:issue:`15125`)
370371
- Bug in ``pd.read_csv()`` for the C engine where ``usecols`` were being indexed incorrectly with ``parse_dates`` (:issue:`14792`)
371372

372-
- Bug in ``Series.dt.round`` inconsistent behaviour on NAT's with different arguments (:issue:`14940`)
373+
- Bug in ``Series.dt.round`` inconsistent behaviour on NAT's with different arguments (:issue:`14940`)

pandas/tools/hashing.py

+30-27
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44

55
import numpy as np
66
from pandas import _hash, Series, factorize, Categorical, Index
7-
from pandas.lib import infer_dtype
7+
from pandas.lib import is_bool_array
88
from pandas.types.generic import ABCIndexClass, ABCSeries, ABCDataFrame
99
from pandas.types.common import is_categorical_dtype
1010

@@ -68,7 +68,7 @@ def adder(h, hashed_to_add):
6868
return h
6969

7070

71-
def hash_array(vals, encoding='utf8', hash_key=None):
71+
def hash_array(vals, encoding='utf8', hash_key=None, categorize=True):
7272
"""
7373
Given a 1d array, return an array of deterministic integers.
7474
@@ -80,53 +80,56 @@ def hash_array(vals, encoding='utf8', hash_key=None):
8080
encoding : string, default 'utf8'
8181
encoding for data & key when strings
8282
hash_key : string key to encode, default to _default_hash_key
83+
categorize : bool, default True
84+
Whether to first categorize object arrays before hashing. This is more
85+
efficient when the array contains duplicate values.
8386
8487
Returns
8588
-------
8689
1d uint64 numpy array of hash values, same length as the vals
8790
8891
"""
8992

90-
# work with categoricals as ints. (This check is above the complex
91-
# check so that we don't ask numpy if categorical is a subdtype of
92-
# complex, as it will choke.)
9393
if hash_key is None:
9494
hash_key = _default_hash_key
9595

96+
# For categoricals, we hash the categories, then remap the codes to the
97+
# hash values. (This check is above the complex check so that we don't ask
98+
# numpy if categorical is a subdtype of complex, as it will choke.)
9699
if is_categorical_dtype(vals.dtype):
97-
vals = vals.codes
100+
cat_hashed = hash_array(vals.categories.values, encoding, hash_key,
101+
categorize=False).astype(np.uint64, copy=False)
102+
# Since `cat_hashed` is already distributed in the space of uint64s,
103+
# we can just return after remapping the codes here
104+
c = Series(vals)
105+
return c.cat.rename_categories(cat_hashed).values.astype(np.uint64)
98106

99107
# we'll be working with everything as 64-bit values, so handle this
100108
# 128-bit value early
101109
if np.issubdtype(vals.dtype, np.complex128):
102110
return hash_array(vals.real) + 23 * hash_array(vals.imag)
103111

104-
# MAIN LOGIC:
105-
inferred = infer_dtype(vals)
106-
107112
# First, turn whatever array this is into unsigned 64-bit ints, if we can
108113
# manage it.
109-
if inferred == 'boolean':
114+
if is_bool_array(vals):
110115
vals = vals.astype('u8')
111-
112-
if (np.issubdtype(vals.dtype, np.datetime64) or
113-
np.issubdtype(vals.dtype, np.timedelta64) or
114-
np.issubdtype(vals.dtype, np.number)) and vals.dtype.itemsize <= 8:
115-
116+
elif (np.issubdtype(vals.dtype, np.datetime64) or
117+
np.issubdtype(vals.dtype, np.timedelta64) or
118+
np.issubdtype(vals.dtype, np.number)) and vals.dtype.itemsize <= 8:
116119
vals = vals.view('u{}'.format(vals.dtype.itemsize)).astype('u8')
117120
else:
118-
119-
# it's MUCH faster to categorize object dtypes, then hash and rename
120-
codes, categories = factorize(vals, sort=False)
121-
categories = Index(categories)
122-
c = Series(Categorical(codes, categories,
123-
ordered=False, fastpath=True))
124-
vals = _hash.hash_object_array(categories.values,
125-
hash_key,
126-
encoding)
127-
128-
# rename & extract
129-
vals = c.cat.rename_categories(Index(vals)).astype(np.uint64).values
121+
# With repeated values, it's MUCH faster to categorize object dtypes,
122+
# then hash and rename categories. We allow skipping the categorization
123+
# when the values are known/likely to be unique.
124+
if categorize:
125+
codes, categories = factorize(vals, sort=False)
126+
c = Series(Categorical(codes, Index(categories),
127+
ordered=False, fastpath=True))
128+
vals = _hash.hash_object_array(categories, hash_key, encoding)
129+
# rename & extract
130+
vals = c.cat.rename_categories(vals).values.astype(np.uint64)
131+
else:
132+
vals = _hash.hash_object_array(vals, hash_key, encoding)
130133

131134
# Then, redistribute these 64-bit ints within the space of 64-bit ints
132135
vals ^= vals >> 30

pandas/tools/tests/test_hashing.py

+14
Original file line numberDiff line numberDiff line change
@@ -90,6 +90,20 @@ def test_hash_pandas_empty_object(self):
9090
# these are by-definition the same with
9191
# or w/o the index as the data is empty
9292

93+
def test_categorical_consistency(self):
94+
# Check that categoricals hash consistent with their values, not codes
95+
# This should work for categoricals of any dtype
96+
for data in [['a', 'b', 'c', 'd'], [1000, 2000, 3000, 4000]]:
97+
s1 = Series(data)
98+
s2 = s1.astype('category').cat.set_categories(data)
99+
s3 = s2.cat.set_categories(list(reversed(data)))
100+
# These should all hash identically
101+
h1 = hash_pandas_object(s1)
102+
h2 = hash_pandas_object(s2)
103+
h3 = hash_pandas_object(s3)
104+
tm.assert_series_equal(h1, h2)
105+
tm.assert_series_equal(h1, h3)
106+
93107
def test_errors(self):
94108

95109
for obj in [pd.Timestamp('20130101'), tm.makePanel()]:

0 commit comments

Comments
 (0)