BUG: Categoricals hash consistently

jcrist · AnkurDedania · commit d096b060c1bb · 2017-03-21T11:16:52.000-05:00
Previously categorical values were hashed using just their codes. This meant that the hash value depended on the ordering of the categories, rather than on the values the series represented. This caused problems in dask, where different partitions might have different categorical mappings. This PR makes the hashing dependent on the values the categorical represents, rather than on the codes. The categories are first hashed, and then the codes are remapped to the hashed values. This is slightly slower than before (still need to hash the categories, where we didn't before), but allows for more consistent hashing. Related to this work in dask: dask/dask#1877. Author: Jim Crist <crist042@umn.edu> Closes pandas-dev#15143 from jcrist/categories_hash_consistently and squashes the following commits: f1aea13 [Jim Crist] Address comments 7878c55 [Jim Crist] Categoricals hash consistently
diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt
@@ -326,6 +326,7 @@ Bug Fixes
 - Bug in ``pd.read_csv()`` in which the ``dialect`` parameter was not being verified before processing (:issue:`14898`)
 - Bug in ``pd.read_fwf`` where the skiprows parameter was not being respected during column width inference (:issue:`11256`)
 - Bug in ``pd.read_csv()`` in which missing data was being improperly handled with ``usecols`` (:issue:`6710`)
+- Bug in ``pd.tools.hashing.hash_pandas_object()`` in which hashing of categoricals depended on the ordering of categories, instead of just their values. (:issue:`15143`)
 
 - Bug in ``DataFrame.loc`` with indexing a ``MultiIndex`` with a ``Series`` indexer (:issue:`14730`)
 
diff --git a/pandas/tools/hashing.py b/pandas/tools/hashing.py
@@ -4,15 +4,17 @@
 
 import numpy as np
 from pandas import _hash, Series, factorize, Categorical, Index
-from pandas.lib import infer_dtype
+from pandas.lib import is_bool_array
 from pandas.types.generic import ABCIndexClass, ABCSeries, ABCDataFrame
-from pandas.types.common import is_categorical_dtype
+from pandas.types.common import (is_categorical_dtype, is_numeric_dtype,
+                                 is_datetime64_dtype, is_timedelta64_dtype)
 
 # 16 byte long hashing key
 _default_hash_key = '0123456789123456'
 
 
-def hash_pandas_object(obj, index=True, encoding='utf8', hash_key=None):
+def hash_pandas_object(obj, index=True, encoding='utf8', hash_key=None,
+                       categorize=True):
     """
     Return a data hash of the Index/Series/DataFrame
 
@@ -25,6 +27,11 @@ def hash_pandas_object(obj, index=True, encoding='utf8', hash_key=None):
     encoding : string, default 'utf8'
         encoding for data & key when strings
     hash_key : string key to encode, default to _default_hash_key
+    categorize : bool, default True
+        Whether to first categorize object arrays before hashing. This is more
+        efficient when the array contains duplicate values.
+
+        .. versionadded:: 0.20.0
 
     Returns
     -------
@@ -39,36 +46,61 @@ def adder(h, hashed_to_add):
         return np.add(h, hashed_to_add, h)
 
     if isinstance(obj, ABCIndexClass):
-        h = hash_array(obj.values, encoding, hash_key).astype('uint64')
+        h = hash_array(obj.values, encoding, hash_key,
+                       categorize).astype('uint64')
         h = Series(h, index=obj, dtype='uint64')
     elif isinstance(obj, ABCSeries):
-        h = hash_array(obj.values, encoding, hash_key).astype('uint64')
+        h = hash_array(obj.values, encoding, hash_key,
+                       categorize).astype('uint64')
         if index:
             h = adder(h, hash_pandas_object(obj.index,
                                             index=False,
                                             encoding=encoding,
-                                            hash_key=hash_key).values)
+                                            hash_key=hash_key,
+                                            categorize=categorize).values)
         h = Series(h, index=obj.index, dtype='uint64')
     elif isinstance(obj, ABCDataFrame):
         cols = obj.iteritems()
         first_series = next(cols)[1]
         h = hash_array(first_series.values, encoding,
-                       hash_key).astype('uint64')
+                       hash_key, categorize).astype('uint64')
         for _, col in cols:
-            h = adder(h, hash_array(col.values, encoding, hash_key))
+            h = adder(h, hash_array(col.values, encoding, hash_key,
+                                    categorize))
         if index:
             h = adder(h, hash_pandas_object(obj.index,
                                             index=False,
                                             encoding=encoding,
-                                            hash_key=hash_key).values)
+                                            hash_key=hash_key,
+                                            categorize=categorize).values)
 
         h = Series(h, index=obj.index, dtype='uint64')
     else:
         raise TypeError("Unexpected type for hashing %s" % type(obj))
     return h
 
 
-def hash_array(vals, encoding='utf8', hash_key=None):
+def _hash_categorical(c, encoding, hash_key):
+    """
+    Hash a Categorical by hashing its categories, and then mapping the codes
+    to the hashes
+
+    Parameters
+    ----------
+    c : Categorical
+    encoding : string
+    hash_key : string key to encode
+
+    Returns
+    -------
+    ndarray of hashed values, same length as c
+    """
+    cat_hashed = hash_array(c.categories.values, encoding, hash_key,
+                            categorize=False).astype(np.uint64, copy=False)
+    return c.rename_categories(cat_hashed).astype(np.uint64)
+
+
+def hash_array(vals, encoding='utf8', hash_key=None, categorize=True):
     """
     Given a 1d array, return an array of deterministic integers.
 
@@ -80,53 +112,51 @@ def hash_array(vals, encoding='utf8', hash_key=None):
     encoding : string, default 'utf8'
         encoding for data & key when strings
     hash_key : string key to encode, default to _default_hash_key
+    categorize : bool, default True
+        Whether to first categorize object arrays before hashing. This is more
+        efficient when the array contains duplicate values.
+
+        .. versionadded:: 0.20.0
 
     Returns
     -------
     1d uint64 numpy array of hash values, same length as the vals
 
     """
 
-    # work with cagegoricals as ints. (This check is above the complex
-    # check so that we don't ask numpy if categorical is a subdtype of
-    # complex, as it will choke.
     if hash_key is None:
         hash_key = _default_hash_key
 
+    # For categoricals, we hash the categories, then remap the codes to the
+    # hash values. (This check is above the complex check so that we don't ask
+    # numpy if categorical is a subdtype of complex, as it will choke.)
     if is_categorical_dtype(vals.dtype):
-        vals = vals.codes
+        return _hash_categorical(vals, encoding, hash_key)
 
     # we'll be working with everything as 64-bit values, so handle this
     # 128-bit value early
     if np.issubdtype(vals.dtype, np.complex128):
         return hash_array(vals.real) + 23 * hash_array(vals.imag)
 
-    # MAIN LOGIC:
-    inferred = infer_dtype(vals)
-
     # First, turn whatever array this is into unsigned 64-bit ints, if we can
     # manage it.
-    if inferred == 'boolean':
+    if is_bool_array(vals):
         vals = vals.astype('u8')
-
-    if (np.issubdtype(vals.dtype, np.datetime64) or
-       np.issubdtype(vals.dtype, np.timedelta64) or
-       np.issubdtype(vals.dtype, np.number)) and vals.dtype.itemsize <= 8:
-
+    elif ((is_datetime64_dtype(vals) or
+           is_timedelta64_dtype(vals) or
+           is_numeric_dtype(vals)) and vals.dtype.itemsize <= 8):
         vals = vals.view('u{}'.format(vals.dtype.itemsize)).astype('u8')
     else:
-
-        # its MUCH faster to categorize object dtypes, then hash and rename
-        codes, categories = factorize(vals, sort=False)
-        categories = Index(categories)
-        c = Series(Categorical(codes, categories,
-                               ordered=False, fastpath=True))
-        vals = _hash.hash_object_array(categories.values,
-                                       hash_key,
-                                       encoding)
-
-        # rename & extract
-        vals = c.cat.rename_categories(Index(vals)).astype(np.uint64).values
+        # With repeated values, it's MUCH faster to categorize object dtypes,
+        # then hash and rename categories. We allow skipping the categorization
+        # when the values are known/likely to be unique.
+        if categorize:
+            codes, categories = factorize(vals, sort=False)
+            cat = Categorical(codes, Index(categories),
+                              ordered=False, fastpath=True)
+            return _hash_categorical(cat, encoding, hash_key)
+
+        vals = _hash.hash_object_array(vals, hash_key, encoding)
 
     # Then, redistribute these 64-bit ints within the space of 64-bit ints
     vals ^= vals >> 30
diff --git a/pandas/tools/tests/test_hashing.py b/pandas/tools/tests/test_hashing.py
@@ -90,6 +90,23 @@ def test_hash_pandas_empty_object(self):
             # these are by-definition the same with
             # or w/o the index as the data is empty
 
+    def test_categorical_consistency(self):
+        # GH15143
+        # Check that categoricals hash consistently with values, not codes
+        # This should work for categoricals of any dtype
+        for s1 in [Series(['a', 'b', 'c', 'd']),
+                   Series([1000, 2000, 3000, 4000]),
+                   Series(pd.date_range(0, periods=4))]:
+            s2 = s1.astype('category').cat.set_categories(s1)
+            s3 = s2.cat.set_categories(list(reversed(s1)))
+            for categorize in [True, False]:
+                # These should all hash identically
+                h1 = hash_pandas_object(s1, categorize=categorize)
+                h2 = hash_pandas_object(s2, categorize=categorize)
+                h3 = hash_pandas_object(s3, categorize=categorize)
+                tm.assert_series_equal(h1, h2)
+                tm.assert_series_equal(h1, h3)
+
     def test_errors(self):
 
         for obj in [pd.Timestamp('20130101'), tm.makePanel()]: