Use _values_for_factorize by default for hashing ExtensionArrays (#53475)

jorisvandenbossche · web-flow · commit 2a1d2da13f45 · 2023-06-19T16:31:50.000+02:00
diff --git a/doc/source/whatsnew/v2.0.3.rst b/doc/source/whatsnew/v2.0.3.rst
@@ -14,6 +14,7 @@ including other versions of pandas.
 Fixed regressions
 ~~~~~~~~~~~~~~~~~
 - Fixed performance regression in merging on datetime-like columns (:issue:`53231`)
+- For external ExtensionArray implementations, restored the default use of ``_values_for_factorize`` for hashing arrays (:issue:`53475`)
 -
 
 .. ---------------------------------------------------------------------------
diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py
@@ -993,7 +993,6 @@ def _values_for_factorize(self) -> tuple[np.ndarray, Any]:
         Returns
         -------
         values : ndarray
-
             An array suitable for factorization. This should maintain order
             and be a supported dtype (Float64, Int64, UInt64, String, Object).
             By default, the extension array is cast to object dtype.
@@ -1002,6 +1001,12 @@ def _values_for_factorize(self) -> tuple[np.ndarray, Any]:
             as NA in the factorization routines, so it will be coded as
             `-1` and not included in `uniques`. By default,
             ``np.nan`` is used.
+
+        Notes
+        -----
+        The values returned by this method are also used in
+        :func:`pandas.util.hash_pandas_object`. If hashing needs to use
+        different values, override the ``_hash_pandas_object()`` method instead.
         """
         return self.astype(object), np.nan
 
@@ -1449,7 +1454,7 @@ def _hash_pandas_object(
         """
         Hook for hash_pandas_object.
 
-        Default is likely non-performant.
+        Default is to use the values returned by _values_for_factorize.
 
         Parameters
         ----------
@@ -1463,7 +1468,7 @@ def _hash_pandas_object(
         """
         from pandas.core.util.hashing import hash_array
 
-        values = self.to_numpy(copy=False)
+        values, _ = self._values_for_factorize()
         return hash_array(
             values, encoding=encoding, hash_key=hash_key, categorize=categorize
         )
diff --git a/pandas/tests/extension/json/test_json.py b/pandas/tests/extension/json/test_json.py
@@ -240,10 +240,6 @@ class TestReduce(base.BaseNoReduceTests):
 
 
 class TestMethods(BaseJSON, base.BaseMethodsTests):
-    @pytest.mark.xfail(reason="ValueError: setting an array element with a sequence")
-    def test_hash_pandas_object(self, data):
-        super().test_hash_pandas_object(data)
-
     @unhashable
     def test_value_counts(self, all_data, dropna):
         super().test_value_counts(all_data, dropna)
@@ -286,10 +282,6 @@ def test_combine_add(self, data_repeated):
     def test_combine_first(self, data):
         super().test_combine_first(data)
 
-    @unhashable
-    def test_hash_pandas_object_works(self, data, kind):
-        super().test_hash_pandas_object_works(data, kind)
-
     @pytest.mark.xfail(reason="broadcasting error")
     def test_where_series(self, data, na_value):
         # Fails with

Original file line number	Diff line number	Diff line change
`@@ -14,6 +14,7 @@ including other versions of pandas.`
`14`	`14`	`Fixed regressions`
`15`	`15`	`~~~~~~~~~~~~~~~~~`
`16`	`16`	- Fixed performance regression in merging on datetime-like columns (:issue:`53231`)
	`17`	+- For external ExtensionArray implementations, restored the default use of ``_values_for_factorize`` for hashing arrays (:issue:`53475`)
`17`	`18`	`-`
`18`	`19`
`19`	`20`	`.. ---------------------------------------------------------------------------`