From 089aa0cae6e66814a98749aba6927c9267182457 Mon Sep 17 00:00:00 2001
From: Brock
Date: Fri, 10 Feb 2023 16:09:20 -0800
Subject: [PATCH 1/5] ENH: EA._hash_pandas_object

---
 pandas/core/arrays/_mixins.py            | 10 ++++
 pandas/core/arrays/base.py               | 20 +++++--
 pandas/core/arrays/categorical.py        | 32 +++++++++++
 pandas/core/util/hashing.py              | 69 ++++--------------------
 pandas/tests/extension/base/methods.py   | 11 ++++
 pandas/tests/extension/json/test_json.py |  4 ++
 6 files changed, 82 insertions(+), 64 deletions(-)

diff --git a/pandas/core/arrays/_mixins.py b/pandas/core/arrays/_mixins.py
index 0a4a550f5d8bc..25ee409eba477 100644
--- a/pandas/core/arrays/_mixins.py
+++ b/pandas/core/arrays/_mixins.py
@@ -192,6 +192,16 @@ def _values_for_argsort(self) -> np.ndarray:
     def _values_for_factorize(self):
         return self._ndarray, self._internal_fill_value
 
+    def _hash_pandas_object(
+        self, *, encoding: str, hash_key: str, categorize: bool
+    ) -> npt.NDArray[np.uint64]:
+        from pandas.core.util.hashing import hash_array
+
+        values = self._ndarray
+        return hash_array(
+            values, encoding=encoding, hash_key=hash_key, categorize=categorize
+        )
+
     # Signature of "argmin" incompatible with supertype "ExtensionArray"
     def argmin(self, axis: AxisInt = 0, skipna: bool = True):  # type: ignore[override]
         # override base class by adding axis keyword
diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py
index c261a41e1e77e..07d9c56db8a6e 100644
--- a/pandas/core/arrays/base.py
+++ b/pandas/core/arrays/base.py
@@ -1005,11 +1005,6 @@ def _values_for_factorize(self) -> tuple[np.ndarray, Any]:
             as NA in the factorization routines, so it will be coded as
             `-1` and not included in `uniques`. By default,
             ``np.nan`` is used.
-
-        Notes
-        -----
-        The values returned by this method are also used in
-        :func:`pandas.util.hash_pandas_object`.
         """
         return self.astype(object), np.nan
 
@@ -1455,6 +1450,21 @@ def _reduce(self, name: str, *, skipna: bool = True, **kwargs):
     # Non-Optimized Default Methods; in the case of the private methods here,
     # these are not guaranteed to be stable across pandas versions.
 
+    def _hash_pandas_object(
+        self, *, encoding: str, hash_key: str, categorize: bool
+    ) -> npt.NDArray[np.uint64]:
+        """
+        Hook for hash_pandas_object.
+
+        Default is likely non-performant.
+        """
+        from pandas.core.util.hashing import hash_array
+
+        values = self.to_numpy(copy=False)
+        return hash_array(
+            values, encoding=encoding, hash_key=hash_key, categorize=categorize
+        )
+
     def tolist(self) -> list:
         """
         Return a list of the values.
diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py
index fb953e601735e..15a45f9574fbc 100644
--- a/pandas/core/arrays/categorical.py
+++ b/pandas/core/arrays/categorical.py
@@ -1762,6 +1762,38 @@ def _values_for_rank(self):
             )
         return values
 
+    def _hash_pandas_object(
+        self, *, encoding: str, hash_key: str, categorize: bool
+    ) -> npt.NDArray[np.uint64]:
+        """
+        Hash a Categorical by hashing its categories, and then mapping the codes
+        to the hashes
+        """
+        # Note we ignore categorize, as we are already Categorical.
+        from pandas.core.util.hashing import hash_array
+
+        # Convert ExtensionArrays to ndarrays
+        values = np.asarray(self.categories._values)
+        hashed = hash_array(values, encoding, hash_key, categorize=False)
+
+        # we have uint64, as we don't directly support missing values
+        # we don't want to use take_nd which will coerce to float
+        # instead, directly construct the result with a
+        # max(np.uint64) as the missing value indicator
+        #
+        # TODO: GH#15362
+
+        mask = self.isna()
+        if len(hashed):
+            result = hashed.take(self._codes)
+        else:
+            result = np.zeros(len(mask), dtype="uint64")
+
+        if mask.any():
+            result[mask] = lib.u8max
+
+        return result
+
     # ------------------------------------------------------------------
     # NDArrayBackedExtensionArray compat
 
diff --git a/pandas/core/util/hashing.py b/pandas/core/util/hashing.py
index 350914cc50556..280e9068b9f44 100644
--- a/pandas/core/util/hashing.py
+++ b/pandas/core/util/hashing.py
@@ -9,22 +9,17 @@
     Hashable,
     Iterable,
     Iterator,
-    cast,
 )
 
 import numpy as np
 
-from pandas._libs import lib
 from pandas._libs.hashing import hash_object_array
 from pandas._typing import (
     ArrayLike,
     npt,
 )
 
-from pandas.core.dtypes.common import (
-    is_categorical_dtype,
-    is_list_like,
-)
+from pandas.core.dtypes.common import is_list_like
 from pandas.core.dtypes.generic import (
     ABCDataFrame,
     ABCExtensionArray,
@@ -35,7 +30,6 @@
 
 if TYPE_CHECKING:
     from pandas import (
-        Categorical,
         DataFrame,
         Index,
         MultiIndex,
@@ -214,53 +208,14 @@ def hash_tuples(
 
     # hash the list-of-ndarrays
     hashes = (
-        _hash_categorical(cat, encoding=encoding, hash_key=hash_key) for cat in cat_vals
+        cat._hash_pandas_object(encoding=encoding, hash_key=hash_key, categorize=False)
+        for cat in cat_vals
     )
 
     h = combine_hash_arrays(hashes, len(cat_vals))
     return h
 
 
-def _hash_categorical(
-    cat: Categorical, encoding: str, hash_key: str
-) -> npt.NDArray[np.uint64]:
-    """
-    Hash a Categorical by hashing its categories, and then mapping the codes
-    to the hashes
-
-    Parameters
-    ----------
-    cat : Categorical
-    encoding : str
-    hash_key : str
-
-    Returns
-    -------
-    ndarray[np.uint64] of hashed values, same size as len(c)
-    """
-    # Convert ExtensionArrays to ndarrays
-    values = np.asarray(cat.categories._values)
-    hashed = hash_array(values, encoding, hash_key, categorize=False)
-
-    # we have uint64, as we don't directly support missing values
-    # we don't want to use take_nd which will coerce to float
-    # instead, directly construct the result with a
-    # max(np.uint64) as the missing value indicator
-    #
-    # TODO: GH 15362
-
-    mask = cat.isna()
-    if len(hashed):
-        result = hashed.take(cat.codes)
-    else:
-        result = np.zeros(len(mask), dtype="uint64")
-
-    if mask.any():
-        result[mask] = lib.u8max
-
-    return result
-
-
 def hash_array(
     vals: ArrayLike,
     encoding: str = "utf8",
@@ -288,17 +243,11 @@ def hash_array(
     """
     if not hasattr(vals, "dtype"):
         raise TypeError("must pass a ndarray-like")
-    dtype = vals.dtype
-
-    # For categoricals, we hash the categories, then remap the codes to the
-    # hash values. (This check is above the complex check so that we don't ask
-    # numpy if categorical is a subdtype of complex, as it will choke).
-    if is_categorical_dtype(dtype):
-        vals = cast("Categorical", vals)
-        return _hash_categorical(vals, encoding, hash_key)
-    elif isinstance(vals, ABCExtensionArray):
-        vals, _ = vals._values_for_factorize()
 
+    if isinstance(vals, ABCExtensionArray):
+        return vals._hash_pandas_object(
+            encoding=encoding, hash_key=hash_key, categorize=categorize
+        )
     elif not isinstance(vals, np.ndarray):
         # GH#42003
 
@@ -347,7 +296,9 @@ def _hash_ndarray(
 
             codes, categories = factorize(vals, sort=False)
             cat = Categorical(codes, Index(categories), ordered=False, fastpath=True)
-            return _hash_categorical(cat, encoding, hash_key)
+            return cat._hash_pandas_object(
+                encoding=encoding, hash_key=hash_key, categorize=False
+            )
 
         try:
             vals = hash_object_array(vals, hash_key, encoding)
diff --git a/pandas/tests/extension/base/methods.py b/pandas/tests/extension/base/methods.py
index b74372017f303..c5414d316b067 100644
--- a/pandas/tests/extension/base/methods.py
+++ b/pandas/tests/extension/base/methods.py
@@ -18,6 +18,17 @@
 class BaseMethodsTests(BaseExtensionTests):
     """Various Series and DataFrame methods."""
 
+    def test_hash_pandas_object(self, data):
+        # _hash_pandas_object should return a uint64 ndarray of the same length
+        # as the data
+        res = data._hash_pandas_object(
+            encoding="utf-8",
+            hash_key=pd.core.util.hashing._default_hash_key,
+            categorize=False,
+        )
+        assert res.dtype == np.uint64
+        assert res.shape == data.shape
+
     def test_value_counts_default_dropna(self, data):
         # make sure we have consistent default dropna kwarg
         if not hasattr(data, "value_counts"):
diff --git a/pandas/tests/extension/json/test_json.py b/pandas/tests/extension/json/test_json.py
index 2e5e2fc77d6c4..32c8a73b6a0cb 100644
--- a/pandas/tests/extension/json/test_json.py
+++ b/pandas/tests/extension/json/test_json.py
@@ -240,6 +240,10 @@ class TestReduce(base.BaseNoReduceTests):
 
 
 class TestMethods(BaseJSON, base.BaseMethodsTests):
+    @pytest.mark.xfail(reason="ValueError: setting an array element with a sequence")
+    def test_hash_pandas_object(self, data):
+        super().test_hash_pandas_object(data)
+
     @unhashable
     def test_value_counts(self, all_data, dropna):
         super().test_value_counts(all_data, dropna)

From aad544cb88d957d99d8259530b95dc9e0f2f8bbe Mon Sep 17 00:00:00 2001
From: Brock
Date: Mon, 27 Feb 2023 14:30:29 -0800
Subject: [PATCH 2/5] update docstring

---
 pandas/core/arrays/base.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py
index 92b70626fce0f..4ba0113e833e3 100644
--- a/pandas/core/arrays/base.py
+++ b/pandas/core/arrays/base.py
@@ -140,6 +140,7 @@ class ExtensionArray:
     _from_factorized
     _from_sequence
     _from_sequence_of_strings
+    _hash_pandas_object
     _reduce
     _values_for_argsort
     _values_for_factorize

From 318adcfb1c6e00949f0c7c9ceaa5e11481ea042c Mon Sep 17 00:00:00 2001
From: Brock
Date: Tue, 28 Feb 2023 08:43:55 -0800
Subject: [PATCH 3/5] troubleshoot docbuild

---
 doc/source/reference/extensions.rst | 1 +
 1 file changed, 1 insertion(+)

diff --git a/doc/source/reference/extensions.rst b/doc/source/reference/extensions.rst
index 595b415ff7342..b33efd388bd60 100644
--- a/doc/source/reference/extensions.rst
+++ b/doc/source/reference/extensions.rst
@@ -38,6 +38,7 @@ objects.
    api.extensions.ExtensionArray._from_factorized
    api.extensions.ExtensionArray._from_sequence
    api.extensions.ExtensionArray._from_sequence_of_strings
+   api.extensions.ExtensionArray._hash_pandas_object
    api.extensions.ExtensionArray._reduce
    api.extensions.ExtensionArray._values_for_argsort
    api.extensions.ExtensionArray._values_for_factorize

From b024c7d93976fd4f1b1c598eaa192e1e3e7df2ed Mon Sep 17 00:00:00 2001
From: Brock
Date: Tue, 28 Feb 2023 09:30:42 -0800
Subject: [PATCH 4/5] troubleshoot code check build

---
 pandas/core/arrays/base.py        | 10 ++++++++++
 pandas/core/arrays/categorical.py | 13 ++++++++++++-
 2 files changed, 22 insertions(+), 1 deletion(-)

diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py
index 4ba0113e833e3..1a082a7579dc3 100644
--- a/pandas/core/arrays/base.py
+++ b/pandas/core/arrays/base.py
@@ -1455,6 +1455,16 @@ def _hash_pandas_object(
         Hook for hash_pandas_object.
 
         Default is likely non-performant.
+
+        Parameters
+        ----------
+        encoding : str
+        hash_key : str
+        categorize : bool
+
+        Returns
+        -------
+        np.ndarray[uint64]
         """
         from pandas.core.util.hashing import hash_array
 
diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py
index 80bc061c2f7aa..dd48da9ab6c16 100644
--- a/pandas/core/arrays/categorical.py
+++ b/pandas/core/arrays/categorical.py
@@ -1767,7 +1767,18 @@ def _hash_pandas_object(
     ) -> npt.NDArray[np.uint64]:
         """
         Hash a Categorical by hashing its categories, and then mapping the codes
-        to the hashes
+        to the hashes.
+
+        Parameters
+        ----------
+        encoding : str
+        hash_key : str
+        categorize : bool
+            Ignored for Categorical.
+
+        Returns
+        -------
+        np.ndarray[uint64]
         """
         # Note we ignore categorize, as we are already Categorical.
         from pandas.core.util.hashing import hash_array

From 9a7d6690a35f256889fe39d3d8b25c52c2e2ac8b Mon Sep 17 00:00:00 2001
From: Brock
Date: Tue, 28 Feb 2023 15:20:48 -0800
Subject: [PATCH 5/5] ignore in code_checks

---
 ci/code_checks.sh | 1 +
 1 file changed, 1 insertion(+)

diff --git a/ci/code_checks.sh b/ci/code_checks.sh
index 2d5e9cc031158..94710e1918466 100755
--- a/ci/code_checks.sh
+++ b/ci/code_checks.sh
@@ -512,6 +512,7 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
         pandas.api.extensions.ExtensionArray._from_factorized \
         pandas.api.extensions.ExtensionArray._from_sequence \
         pandas.api.extensions.ExtensionArray._from_sequence_of_strings \
+        pandas.api.extensions.ExtensionArray._hash_pandas_object \
         pandas.api.extensions.ExtensionArray._reduce \
         pandas.api.extensions.ExtensionArray._values_for_argsort \
         pandas.api.extensions.ExtensionArray._values_for_factorize \
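
As a quick illustration of what this series enables, the snippet below is a
minimal sketch written against this branch, not part of the diffs above;
_hash_pandas_object and _default_hash_key are private internals, so their
exact import locations are assumptions and may change.

    import numpy as np
    import pandas as pd
    from pandas.core.util.hashing import _default_hash_key

    # Categorical hashes its categories once and maps the codes onto those
    # hashes; missing codes are filled with the max-uint64 sentinel.
    cat = pd.Categorical(["a", "b", None, "a"])
    hashed = cat._hash_pandas_object(
        encoding="utf8", hash_key=_default_hash_key, categorize=False
    )
    assert hashed.dtype == np.uint64 and hashed.shape == (4,)
    assert hashed[0] == hashed[3]  # equal categories hash equally

    # Arrays without a specialized override fall back to the default added on
    # ExtensionArray (to_numpy() + hash_array), so hash_pandas_object keeps
    # working for any extension dtype, including third-party ones.
    ser = pd.Series([1, 2, None], dtype="Int64")
    print(pd.util.hash_pandas_object(ser, index=False))

Moving the Categorical logic onto the array and routing hash_array through the
new hook means pandas/core/util/hashing.py no longer needs the
is_categorical_dtype / _values_for_factorize special cases, and third-party
extension arrays can override the same hook when the object-dtype fallback is
too slow for them.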