pandas-dev · jreback · Oct 11, 2018 · Oct 11, 2018
diff --git a/doc/source/api.rst b/doc/source/api.rst
@@ -245,6 +245,15 @@ Top-level evaluation
 
    eval
 
+Hashing
+~~~~~~~
+
+.. autosummary::
+   :toctree: generated/
+
+   util.hash_array
+   util.hash_pandas_object
+
 Testing
 ~~~~~~~
 

diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt
@@ -548,6 +548,7 @@ update the ``ExtensionDtype._metadata`` tuple to match the signature of your
 - :meth:`Series.astype` and :meth:`DataFrame.astype` now dispatch to :meth:`ExtensionArray.astype` (:issue:`21185:`).
 - Slicing a single row of a ``DataFrame`` with multiple ExtensionArrays of the same type now preserves the dtype, rather than coercing to object (:issue:`22784`)
 - Added :meth:`pandas.api.types.register_extension_dtype` to register an extension type with pandas (:issue:`22664`)
+- Series backed by an ``ExtensionArray`` now work with :func:`util.hash_pandas_object` (:issue:`23066`)
 - Updated the ``.type`` attribute for ``PeriodDtype``, ``DatetimeTZDtype``, and ``IntervalDtype`` to be instances of the dtype (``Period``, ``Timestamp``, and ``Interval`` respectively) (:issue:`22938`)
 
 .. _whatsnew_0240.api.incompatibilities:

diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py
@@ -466,6 +466,11 @@ def _values_for_factorize(self):
             as NA in the factorization routines, so it will be coded as
             `na_sentinal` and not included in `uniques`. By default,
             ``np.nan`` is used.
+
+        Notes
+        -----
+        The values returned by this method are also used in
+        :func:`pandas.util.hash_pandas_object`.
         """
         return self.astype(object), np.nan
 

diff --git a/pandas/core/util/hashing.py b/pandas/core/util/hashing.py
@@ -11,7 +11,7 @@
     ABCSeries,
     ABCDataFrame)
 from pandas.core.dtypes.common import (
-    is_categorical_dtype, is_list_like)
+    is_categorical_dtype, is_list_like, is_extension_array_dtype)
 from pandas.core.dtypes.missing import isna
 from pandas.core.dtypes.cast import infer_dtype_from_scalar
 
@@ -265,10 +265,13 @@ def hash_array(vals, encoding='utf8', hash_key=None, categorize=True):
     # numpy if categorical is a subdtype of complex, as it will choke).
     if is_categorical_dtype(dtype):
         return _hash_categorical(vals, encoding, hash_key)
+    elif is_extension_array_dtype(dtype):
+        vals, _ = vals._values_for_factorize()
+        dtype = vals.dtype
 
     # we'll be working with everything as 64-bit values, so handle this
     # 128-bit value early
-    elif np.issubdtype(dtype, np.complex128):
+    if np.issubdtype(dtype, np.complex128):
         return hash_array(vals.real) + 23 * hash_array(vals.imag)
 
     # First, turn whatever array this is into unsigned 64-bit ints, if we can

diff --git a/pandas/tests/extension/base/base.py b/pandas/tests/extension/base/base.py
@@ -2,6 +2,7 @@
 
 
 class BaseExtensionTests(object):
+    assert_equal = staticmethod(tm.assert_equal)
     assert_series_equal = staticmethod(tm.assert_series_equal)
     assert_frame_equal = staticmethod(tm.assert_frame_equal)
     assert_extension_array_equal = staticmethod(

diff --git a/pandas/tests/extension/base/methods.py b/pandas/tests/extension/base/methods.py
@@ -164,3 +164,13 @@ def test_container_shift(self, data, frame, periods, indices):
             compare = self.assert_series_equal
 
         compare(result, expected)
+
+    @pytest.mark.parametrize("as_frame", [True, False])
+    def test_hash_pandas_object_works(self, data, as_frame):
+        # https://github.com/pandas-dev/pandas/issues/23066
+        data = pd.Series(data)
+        if as_frame:
+            data = data.to_frame()
+        a = pd.util.hash_pandas_object(data)
+        b = pd.util.hash_pandas_object(data)
+        self.assert_equal(a, b)
diff --git a/pandas/tests/extension/json/test_json.py b/pandas/tests/extension/json/test_json.py
@@ -199,6 +199,10 @@ def test_combine_le(self, data_repeated):
     def test_combine_add(self, data_repeated):
         pass
 
+    @unhashable
+    def test_hash_pandas_object_works(self, data, kind):
+        super().test_hash_pandas_object_works(data, kind)
+
 
 class TestCasting(BaseJSON, base.BaseCastingTests):
     @pytest.mark.skip(reason="failing on np.array(self, dtype=str)")