Skip to content

Commit 7878c55

Browse files
committed
Categoricals hash consistently
Previously categorical values were hashed using just their codes. This meant that the hash value depended on the ordering of the categories, rather than on the values the series represented. This caused problems in dask, where different partitions might have different categorical mappings. This PR makes the hashing dependent on the values the categorical represents, rather than on the codes. The categories are first hashed, and then the codes are remapped to the hashed values. This is slightly slower than before (we still need to hash the categories, which we didn't do before), but allows for more consistent hashing.
1 parent 0e219d7 commit 7878c55

File tree

3 files changed

+46
-28
lines changed

3 files changed

+46
-28
lines changed

doc/source/whatsnew/v0.20.0.txt

+2-1
Original file line numberDiff line numberDiff line change
@@ -309,6 +309,7 @@ Bug Fixes
309309
- Bug in ``pd.read_csv()`` in which the ``dialect`` parameter was not being verified before processing (:issue:`14898`)
310310
- Bug in ``pd.read_fwf`` where the skiprows parameter was not being respected during column width inference (:issue:`11256`)
311311
- Bug in ``pd.read_csv()`` in which missing data was being improperly handled with ``usecols`` (:issue:`6710`)
312+
- Bug in ``pandas.tools.hashing.hash_pandas_object`` in which hashing of categoricals depended on the ordering of categories, instead of just their values.
312313

313314
- Bug in ``DataFrame.loc`` with indexing a ``MultiIndex`` with a ``Series`` indexer (:issue:`14730`)
314315

@@ -369,4 +370,4 @@ Bug Fixes
369370
- Bug in ``Series`` constructor when both ``copy=True`` and ``dtype`` arguments are provided (:issue:`15125`)
370371
- Bug in ``pd.read_csv()`` for the C engine where ``usecols`` were being indexed incorrectly with ``parse_dates`` (:issue:`14792`)
371372

372-
- Bug in ``Series.dt.round`` inconsistent behaviour on NAT's with different arguments (:issue:`14940`)
373+
- Bug in ``Series.dt.round`` inconsistent behaviour on NAT's with different arguments (:issue:`14940`)

pandas/tools/hashing.py

+30-27
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44

55
import numpy as np
66
from pandas import _hash, Series, factorize, Categorical, Index
7-
from pandas.lib import infer_dtype
7+
from pandas.lib import is_bool_array
88
from pandas.types.generic import ABCIndexClass, ABCSeries, ABCDataFrame
99
from pandas.types.common import is_categorical_dtype
1010

@@ -68,7 +68,7 @@ def adder(h, hashed_to_add):
6868
return h
6969

7070

71-
def hash_array(vals, encoding='utf8', hash_key=None):
71+
def hash_array(vals, encoding='utf8', hash_key=None, categorize=True):
7272
"""
7373
Given a 1d array, return an array of deterministic integers.
7474
@@ -80,53 +80,56 @@ def hash_array(vals, encoding='utf8', hash_key=None):
8080
encoding : string, default 'utf8'
8181
encoding for data & key when strings
8282
hash_key : string key to encode, default to _default_hash_key
83+
categorize : bool, default True
84+
Whether to first categorize object arrays before hashing. This is more
85+
efficient when the array contains duplicate values.
8386
8487
Returns
8588
-------
8689
1d uint64 numpy array of hash values, same length as the vals
8790
8891
"""
8992

90-
# work with categoricals as ints. (This check is above the complex
91-
# check so that we don't ask numpy if categorical is a subdtype of
92-
# complex, as it will choke.)
9393
if hash_key is None:
9494
hash_key = _default_hash_key
9595

96+
# For categoricals, we hash the categories, then remap the codes to the
97+
# hash values. (This check is above the complex check so that we don't ask
98+
# numpy if categorical is a subdtype of complex, as it will choke.)
9699
if is_categorical_dtype(vals.dtype):
97-
vals = vals.codes
100+
cat_hashed = hash_array(vals.categories.values, encoding, hash_key,
101+
categorize=False).astype(np.uint64, copy=False)
102+
# Since `cat_hashed` is already distributed in the space of uint64s,
103+
# we can just return after remapping the codes here
104+
c = Series(vals)
105+
return c.cat.rename_categories(cat_hashed).values.astype(np.uint64)
98106

99107
# we'll be working with everything as 64-bit values, so handle this
100108
# 128-bit value early
101109
if np.issubdtype(vals.dtype, np.complex128):
102110
return hash_array(vals.real) + 23 * hash_array(vals.imag)
103111

104-
# MAIN LOGIC:
105-
inferred = infer_dtype(vals)
106-
107112
# First, turn whatever array this is into unsigned 64-bit ints, if we can
108113
# manage it.
109-
if inferred == 'boolean':
114+
if is_bool_array(vals):
110115
vals = vals.astype('u8')
111-
112-
if (np.issubdtype(vals.dtype, np.datetime64) or
113-
np.issubdtype(vals.dtype, np.timedelta64) or
114-
np.issubdtype(vals.dtype, np.number)) and vals.dtype.itemsize <= 8:
115-
116+
elif (np.issubdtype(vals.dtype, np.datetime64) or
117+
np.issubdtype(vals.dtype, np.timedelta64) or
118+
np.issubdtype(vals.dtype, np.number)) and vals.dtype.itemsize <= 8:
116119
vals = vals.view('u{}'.format(vals.dtype.itemsize)).astype('u8')
117120
else:
118-
119-
# it's MUCH faster to categorize object dtypes, then hash and rename
120-
codes, categories = factorize(vals, sort=False)
121-
categories = Index(categories)
122-
c = Series(Categorical(codes, categories,
123-
ordered=False, fastpath=True))
124-
vals = _hash.hash_object_array(categories.values,
125-
hash_key,
126-
encoding)
127-
128-
# rename & extract
129-
vals = c.cat.rename_categories(Index(vals)).astype(np.uint64).values
121+
# With repeated values, it's MUCH faster to categorize object dtypes,
122+
# then hash and rename categories. We allow skipping the categorization
123+
# when the values are known/likely to be unique.
124+
if categorize:
125+
codes, categories = factorize(vals, sort=False)
126+
c = Series(Categorical(codes, Index(categories),
127+
ordered=False, fastpath=True))
128+
vals = _hash.hash_object_array(categories, hash_key, encoding)
129+
# rename & extract
130+
vals = c.cat.rename_categories(vals).values.astype(np.uint64)
131+
else:
132+
vals = _hash.hash_object_array(vals, hash_key, encoding)
130133

131134
# Then, redistribute these 64-bit ints within the space of 64-bit ints
132135
vals ^= vals >> 30

pandas/tools/tests/test_hashing.py

+14
Original file line numberDiff line numberDiff line change
@@ -90,6 +90,20 @@ def test_hash_pandas_empty_object(self):
9090
# these are by-definition the same with
9191
# or w/o the index as the data is empty
9292

93+
def test_categorical_consistency(self):
94+
# Check that categoricals hash consistent with their values, not codes
95+
# This should work for categoricals of any dtype
96+
for data in [['a', 'b', 'c', 'd'], [1000, 2000, 3000, 4000]]:
97+
s1 = Series(data)
98+
s2 = s1.astype('category').cat.set_categories(data)
99+
s3 = s2.cat.set_categories(list(reversed(data)))
100+
# These should all hash identically
101+
h1 = hash_pandas_object(s1)
102+
h2 = hash_pandas_object(s2)
103+
h3 = hash_pandas_object(s3)
104+
tm.assert_series_equal(h1, h2)
105+
tm.assert_series_equal(h1, h3)
106+
93107
def test_errors(self):
94108

95109
for obj in [pd.Timestamp('20130101'), tm.makePanel()]:

0 commit comments

Comments
 (0)