Skip to content

Commit 02adb3d

Browse files
authored
ENH: EA._hash_pandas_object (#51319)
1 parent d89f162 commit 02adb3d

File tree

8 files changed

+106
-64
lines changed

8 files changed

+106
-64
lines changed

ci/code_checks.sh

+1
Original file line numberDiff line numberDiff line change
@@ -512,6 +512,7 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
512512
pandas.api.extensions.ExtensionArray._from_factorized \
513513
pandas.api.extensions.ExtensionArray._from_sequence \
514514
pandas.api.extensions.ExtensionArray._from_sequence_of_strings \
515+
pandas.api.extensions.ExtensionArray._hash_pandas_object \
515516
pandas.api.extensions.ExtensionArray._reduce \
516517
pandas.api.extensions.ExtensionArray._values_for_argsort \
517518
pandas.api.extensions.ExtensionArray._values_for_factorize \

doc/source/reference/extensions.rst

+1
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@ objects.
3838
api.extensions.ExtensionArray._from_factorized
3939
api.extensions.ExtensionArray._from_sequence
4040
api.extensions.ExtensionArray._from_sequence_of_strings
41+
api.extensions.ExtensionArray._hash_pandas_object
4142
api.extensions.ExtensionArray._reduce
4243
api.extensions.ExtensionArray._values_for_argsort
4344
api.extensions.ExtensionArray._values_for_factorize

pandas/core/arrays/_mixins.py

+10
Original file line numberDiff line numberDiff line change
@@ -192,6 +192,16 @@ def _values_for_argsort(self) -> np.ndarray:
192192
def _values_for_factorize(self):
193193
return self._ndarray, self._internal_fill_value
194194

195+
def _hash_pandas_object(
196+
self, *, encoding: str, hash_key: str, categorize: bool
197+
) -> npt.NDArray[np.uint64]:
198+
from pandas.core.util.hashing import hash_array
199+
200+
values = self._ndarray
201+
return hash_array(
202+
values, encoding=encoding, hash_key=hash_key, categorize=categorize
203+
)
204+
195205
# Signature of "argmin" incompatible with supertype "ExtensionArray"
196206
def argmin(self, axis: AxisInt = 0, skipna: bool = True): # type: ignore[override]
197207
# override base class by adding axis keyword

pandas/core/arrays/base.py

+26-5
Original file line numberDiff line numberDiff line change
@@ -140,6 +140,7 @@ class ExtensionArray:
140140
_from_factorized
141141
_from_sequence
142142
_from_sequence_of_strings
143+
_hash_pandas_object
143144
_reduce
144145
_values_for_argsort
145146
_values_for_factorize
@@ -1002,11 +1003,6 @@ def _values_for_factorize(self) -> tuple[np.ndarray, Any]:
10021003
as NA in the factorization routines, so it will be coded as
10031004
`-1` and not included in `uniques`. By default,
10041005
``np.nan`` is used.
1005-
1006-
Notes
1007-
-----
1008-
The values returned by this method are also used in
1009-
:func:`pandas.util.hash_pandas_object`.
10101006
"""
10111007
return self.astype(object), np.nan
10121008

@@ -1452,6 +1448,31 @@ def _reduce(self, name: str, *, skipna: bool = True, **kwargs):
14521448
# Non-Optimized Default Methods; in the case of the private methods here,
14531449
# these are not guaranteed to be stable across pandas versions.
14541450

1451+
def _hash_pandas_object(
1452+
self, *, encoding: str, hash_key: str, categorize: bool
1453+
) -> npt.NDArray[np.uint64]:
1454+
"""
1455+
Hook for hash_pandas_object.
1456+
1457+
Default is likely non-performant.
1458+
1459+
Parameters
1460+
----------
1461+
encoding : str
1462+
hash_key : str
1463+
categorize : bool
1464+
1465+
Returns
1466+
-------
1467+
np.ndarray[uint64]
1468+
"""
1469+
from pandas.core.util.hashing import hash_array
1470+
1471+
values = self.to_numpy(copy=False)
1472+
return hash_array(
1473+
values, encoding=encoding, hash_key=hash_key, categorize=categorize
1474+
)
1475+
14551476
def tolist(self) -> list:
14561477
"""
14571478
Return a list of the values.

pandas/core/arrays/categorical.py

+43
Original file line numberDiff line numberDiff line change
@@ -1762,6 +1762,49 @@ def _values_for_rank(self):
17621762
)
17631763
return values
17641764

1765+
def _hash_pandas_object(
1766+
self, *, encoding: str, hash_key: str, categorize: bool
1767+
) -> npt.NDArray[np.uint64]:
1768+
"""
1769+
Hash a Categorical by hashing its categories, and then mapping the codes
1770+
to the hashes.
1771+
1772+
Parameters
1773+
----------
1774+
encoding : str
1775+
hash_key : str
1776+
categorize : bool
1777+
Ignored for Categorical.
1778+
1779+
Returns
1780+
-------
1781+
np.ndarray[uint64]
1782+
"""
1783+
# Note we ignore categorize, as we are already Categorical.
1784+
from pandas.core.util.hashing import hash_array
1785+
1786+
# Convert ExtensionArrays to ndarrays
1787+
values = np.asarray(self.categories._values)
1788+
hashed = hash_array(values, encoding, hash_key, categorize=False)
1789+
1790+
# we have uint64, as we don't directly support missing values
1791+
# we don't want to use take_nd which will coerce to float
1792+
# instead, directly construct the result with a
1793+
# max(np.uint64) as the missing value indicator
1794+
#
1795+
# TODO: GH#15362
1796+
1797+
mask = self.isna()
1798+
if len(hashed):
1799+
result = hashed.take(self._codes)
1800+
else:
1801+
result = np.zeros(len(mask), dtype="uint64")
1802+
1803+
if mask.any():
1804+
result[mask] = lib.u8max
1805+
1806+
return result
1807+
17651808
# ------------------------------------------------------------------
17661809
# NDArrayBackedExtensionArray compat
17671810

pandas/core/util/hashing.py

+10-59
Original file line numberDiff line numberDiff line change
@@ -9,22 +9,17 @@
99
Hashable,
1010
Iterable,
1111
Iterator,
12-
cast,
1312
)
1413

1514
import numpy as np
1615

17-
from pandas._libs import lib
1816
from pandas._libs.hashing import hash_object_array
1917
from pandas._typing import (
2018
ArrayLike,
2119
npt,
2220
)
2321

24-
from pandas.core.dtypes.common import (
25-
is_categorical_dtype,
26-
is_list_like,
27-
)
22+
from pandas.core.dtypes.common import is_list_like
2823
from pandas.core.dtypes.generic import (
2924
ABCDataFrame,
3025
ABCExtensionArray,
@@ -35,7 +30,6 @@
3530

3631
if TYPE_CHECKING:
3732
from pandas import (
38-
Categorical,
3933
DataFrame,
4034
Index,
4135
MultiIndex,
@@ -214,53 +208,14 @@ def hash_tuples(
214208

215209
# hash the list-of-ndarrays
216210
hashes = (
217-
_hash_categorical(cat, encoding=encoding, hash_key=hash_key) for cat in cat_vals
211+
cat._hash_pandas_object(encoding=encoding, hash_key=hash_key, categorize=False)
212+
for cat in cat_vals
218213
)
219214
h = combine_hash_arrays(hashes, len(cat_vals))
220215

221216
return h
222217

223218

224-
def _hash_categorical(
225-
cat: Categorical, encoding: str, hash_key: str
226-
) -> npt.NDArray[np.uint64]:
227-
"""
228-
Hash a Categorical by hashing its categories, and then mapping the codes
229-
to the hashes
230-
231-
Parameters
232-
----------
233-
cat : Categorical
234-
encoding : str
235-
hash_key : str
236-
237-
Returns
238-
-------
239-
ndarray[np.uint64] of hashed values, same size as len(c)
240-
"""
241-
# Convert ExtensionArrays to ndarrays
242-
values = np.asarray(cat.categories._values)
243-
hashed = hash_array(values, encoding, hash_key, categorize=False)
244-
245-
# we have uint64, as we don't directly support missing values
246-
# we don't want to use take_nd which will coerce to float
247-
# instead, directly construct the result with a
248-
# max(np.uint64) as the missing value indicator
249-
#
250-
# TODO: GH 15362
251-
252-
mask = cat.isna()
253-
if len(hashed):
254-
result = hashed.take(cat.codes)
255-
else:
256-
result = np.zeros(len(mask), dtype="uint64")
257-
258-
if mask.any():
259-
result[mask] = lib.u8max
260-
261-
return result
262-
263-
264219
def hash_array(
265220
vals: ArrayLike,
266221
encoding: str = "utf8",
@@ -288,17 +243,11 @@ def hash_array(
288243
"""
289244
if not hasattr(vals, "dtype"):
290245
raise TypeError("must pass a ndarray-like")
291-
dtype = vals.dtype
292-
293-
# For categoricals, we hash the categories, then remap the codes to the
294-
# hash values. (This check is above the complex check so that we don't ask
295-
# numpy if categorical is a subdtype of complex, as it will choke).
296-
if is_categorical_dtype(dtype):
297-
vals = cast("Categorical", vals)
298-
return _hash_categorical(vals, encoding, hash_key)
299246

300-
elif isinstance(vals, ABCExtensionArray):
301-
vals, _ = vals._values_for_factorize()
247+
if isinstance(vals, ABCExtensionArray):
248+
return vals._hash_pandas_object(
249+
encoding=encoding, hash_key=hash_key, categorize=categorize
250+
)
302251

303252
elif not isinstance(vals, np.ndarray):
304253
# GH#42003
@@ -347,7 +296,9 @@ def _hash_ndarray(
347296

348297
codes, categories = factorize(vals, sort=False)
349298
cat = Categorical(codes, Index(categories), ordered=False, fastpath=True)
350-
return _hash_categorical(cat, encoding, hash_key)
299+
return cat._hash_pandas_object(
300+
encoding=encoding, hash_key=hash_key, categorize=False
301+
)
351302

352303
try:
353304
vals = hash_object_array(vals, hash_key, encoding)

pandas/tests/extension/base/methods.py

+11
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,17 @@
1818
class BaseMethodsTests(BaseExtensionTests):
1919
"""Various Series and DataFrame methods."""
2020

21+
def test_hash_pandas_object(self, data):
22+
# _hash_pandas_object should return a uint64 ndarray of the same length
23+
# as the data
24+
res = data._hash_pandas_object(
25+
encoding="utf-8",
26+
hash_key=pd.core.util.hashing._default_hash_key,
27+
categorize=False,
28+
)
29+
assert res.dtype == np.uint64
30+
assert res.shape == data.shape
31+
2132
def test_value_counts_default_dropna(self, data):
2233
# make sure we have consistent default dropna kwarg
2334
if not hasattr(data, "value_counts"):

pandas/tests/extension/json/test_json.py

+4
Original file line numberDiff line numberDiff line change
@@ -240,6 +240,10 @@ class TestReduce(base.BaseNoReduceTests):
240240

241241

242242
class TestMethods(BaseJSON, base.BaseMethodsTests):
243+
@pytest.mark.xfail(reason="ValueError: setting an array element with a sequence")
244+
def test_hash_pandas_object(self, data):
245+
super().test_hash_pandas_object(data)
246+
243247
@unhashable
244248
def test_value_counts(self, all_data, dropna):
245249
super().test_value_counts(all_data, dropna)

0 commit comments

Comments (0)