Address comments

jcrist · jcrist · commit f1aea137e3e7 · 2017-01-17T13:31:09.000-06:00
- Add `categorize` parameter to `hash_pandas_object`
- Update test
- Update whatsnew
diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt
@@ -309,7 +309,7 @@ Bug Fixes
 - Bug in ``pd.read_csv()`` in which the ``dialect`` parameter was not being verified before processing (:issue:`14898`)
 - Bug in ``pd.read_fwf`` where the skiprows parameter was not being respected during column width inference (:issue:`11256`)
 - Bug in ``pd.read_csv()`` in which missing data was being improperly handled with ``usecols`` (:issue:`6710`)
-- Bug in ``pandas.tools.hashing.hash_pandas_object`` in which hashing of categoricals depended on the ordering of categories, instead of just their values.
+- Bug in ``pd.tools.hashing.hash_pandas_object()`` in which hashing of categoricals depended on the ordering of categories, instead of just their values (:issue:`15143`)
 
 - Bug in ``DataFrame.loc`` with indexing a ``MultiIndex`` with a ``Series`` indexer (:issue:`14730`)
 
diff --git a/pandas/tools/hashing.py b/pandas/tools/hashing.py
@@ -12,7 +12,8 @@
 _default_hash_key = '0123456789123456'
 
 
-def hash_pandas_object(obj, index=True, encoding='utf8', hash_key=None):
+def hash_pandas_object(obj, index=True, encoding='utf8', hash_key=None,
+                       categorize=True):
     """
     Return a data hash of the Index/Series/DataFrame
 
@@ -25,6 +26,11 @@ def hash_pandas_object(obj, index=True, encoding='utf8', hash_key=None):
     encoding : string, default 'utf8'
         encoding for data & key when strings
     hash_key : string key to encode, default to _default_hash_key
+    categorize : bool, default True
+        Whether to first categorize object arrays before hashing. This is more
+        efficient when the array contains duplicate values.
+
+        .. versionadded:: 0.20.0
 
     Returns
     -------
@@ -39,35 +45,48 @@ def adder(h, hashed_to_add):
         return np.add(h, hashed_to_add, h)
 
     if isinstance(obj, ABCIndexClass):
-        h = hash_array(obj.values, encoding, hash_key).astype('uint64')
+        h = hash_array(obj.values, encoding, hash_key,
+                       categorize).astype('uint64')
         h = Series(h, index=obj, dtype='uint64')
     elif isinstance(obj, ABCSeries):
-        h = hash_array(obj.values, encoding, hash_key).astype('uint64')
+        h = hash_array(obj.values, encoding, hash_key,
+                       categorize).astype('uint64')
         if index:
             h = adder(h, hash_pandas_object(obj.index,
                                             index=False,
                                             encoding=encoding,
-                                            hash_key=hash_key).values)
+                                            hash_key=hash_key,
+                                            categorize=categorize).values)
         h = Series(h, index=obj.index, dtype='uint64')
     elif isinstance(obj, ABCDataFrame):
         cols = obj.iteritems()
         first_series = next(cols)[1]
         h = hash_array(first_series.values, encoding,
-                       hash_key).astype('uint64')
+                       hash_key, categorize).astype('uint64')
         for _, col in cols:
-            h = adder(h, hash_array(col.values, encoding, hash_key))
+            h = adder(h, hash_array(col.values, encoding, hash_key,
+                                    categorize))
         if index:
             h = adder(h, hash_pandas_object(obj.index,
                                             index=False,
                                             encoding=encoding,
-                                            hash_key=hash_key).values)
+                                            hash_key=hash_key,
+                                            categorize=categorize).values)
 
         h = Series(h, index=obj.index, dtype='uint64')
     else:
         raise TypeError("Unexpected type for hashing %s" % type(obj))
     return h
 
 
+def _hash_categorical(c, encoding, hash_key):
+    """Hash a Categorical by hashing its categories, and then mapping the codes
+    to the hashes"""
+    cat_hashed = hash_array(c.categories.values, encoding, hash_key,
+                            categorize=False).astype(np.uint64, copy=False)
+    return c.rename_categories(cat_hashed).astype(np.uint64)
+
+
 def hash_array(vals, encoding='utf8', hash_key=None, categorize=True):
     """
     Given a 1d array, return an array of deterministic integers.
@@ -84,6 +103,8 @@ def hash_array(vals, encoding='utf8', hash_key=None, categorize=True):
         Whether to first categorize object arrays before hashing. This is more
         efficient when the array contains duplicate values.
 
+        .. versionadded:: 0.20.0
+
     Returns
     -------
     1d uint64 numpy array of hash values, same length as the vals
@@ -97,12 +118,7 @@ def hash_array(vals, encoding='utf8', hash_key=None, categorize=True):
     # hash values. (This check is above the complex check so that we don't ask
     # numpy if categorical is a subdtype of complex, as it will choke.
     if is_categorical_dtype(vals.dtype):
-        cat_hashed = hash_array(vals.categories.values, encoding, hash_key,
-                                categorize=False).astype(np.uint64, copy=False)
-        # Since `cat_hashed` is already distributed in the space of uint64s,
-        # we can just return after remapping the codes here
-        c = Series(vals)
-        return c.cat.rename_categories(cat_hashed).values.astype(np.uint64)
+        return _hash_categorical(vals, encoding, hash_key)
 
     # we'll be working with everything as 64-bit values, so handle this
     # 128-bit value early
@@ -123,11 +139,9 @@ def hash_array(vals, encoding='utf8', hash_key=None, categorize=True):
         # when the values are known/likely to be unique.
         if categorize:
             codes, categories = factorize(vals, sort=False)
-            c = Series(Categorical(codes, Index(categories),
-                                   ordered=False, fastpath=True))
-            vals = _hash.hash_object_array(categories, hash_key, encoding)
-            # rename & extract
-            vals = c.cat.rename_categories(vals).values.astype(np.uint64)
+            cat = Categorical(codes, Index(categories),
+                              ordered=False, fastpath=True)
+            return _hash_categorical(cat, encoding, hash_key)
         else:
             vals = _hash.hash_object_array(vals, hash_key, encoding)
 
diff --git a/pandas/tools/tests/test_hashing.py b/pandas/tools/tests/test_hashing.py
@@ -93,16 +93,18 @@ def test_hash_pandas_empty_object(self):
     def test_categorical_consistency(self):
         # Check that categoricals hash consistent with their values, not codes
         # This should work for categoricals of any dtype
-        for data in [['a', 'b', 'c', 'd'], [1000, 2000, 3000, 4000]]:
-            s1 = Series(data)
-            s2 = s1.astype('category').cat.set_categories(data)
-            s3 = s2.cat.set_categories(list(reversed(data)))
-            # These should all hash identically
-            h1 = hash_pandas_object(s1)
-            h2 = hash_pandas_object(s2)
-            h3 = hash_pandas_object(s3)
-            tm.assert_series_equal(h1, h2)
-            tm.assert_series_equal(h1, h3)
+        for s1 in [Series(['a', 'b', 'c', 'd']),
+                   Series([1000, 2000, 3000, 4000]),
+                   Series(pd.date_range(0, periods=4))]:
+            s2 = s1.astype('category').cat.set_categories(s1)
+            s3 = s2.cat.set_categories(list(reversed(s1)))
+            for categorize in [True, False]:
+                # These should all hash identically
+                h1 = hash_pandas_object(s1, categorize=categorize)
+                h2 = hash_pandas_object(s2, categorize=categorize)
+                h3 = hash_pandas_object(s3, categorize=categorize)
+                tm.assert_series_equal(h1, h2)
+                tm.assert_series_equal(h1, h3)
 
     def test_errors(self):