From 089aa0cae6e66814a98749aba6927c9267182457 Mon Sep 17 00:00:00 2001
From: Brock
Date: Fri, 10 Feb 2023 16:09:20 -0800
Subject: [PATCH 1/5] ENH: EA._hash_pandas_object

---
 pandas/core/arrays/_mixins.py            | 10 ++++
 pandas/core/arrays/base.py               | 20 +++++--
 pandas/core/arrays/categorical.py        | 32 +++++++++++
 pandas/core/util/hashing.py              | 69 ++++--------------------
 pandas/tests/extension/base/methods.py   | 11 ++++
 pandas/tests/extension/json/test_json.py |  4 ++
 6 files changed, 82 insertions(+), 64 deletions(-)

diff --git a/pandas/core/arrays/_mixins.py b/pandas/core/arrays/_mixins.py
index 0a4a550f5d8bc..25ee409eba477 100644
--- a/pandas/core/arrays/_mixins.py
+++ b/pandas/core/arrays/_mixins.py
@@ -192,6 +192,16 @@ def _values_for_argsort(self) -> np.ndarray:
     def _values_for_factorize(self):
         return self._ndarray, self._internal_fill_value
 
+    def _hash_pandas_object(
+        self, *, encoding: str, hash_key: str, categorize: bool
+    ) -> npt.NDArray[np.uint64]:
+        from pandas.core.util.hashing import hash_array
+
+        values = self._ndarray
+        return hash_array(
+            values, encoding=encoding, hash_key=hash_key, categorize=categorize
+        )
+
     # Signature of "argmin" incompatible with supertype "ExtensionArray"
     def argmin(self, axis: AxisInt = 0, skipna: bool = True):  # type: ignore[override]
         # override base class by adding axis keyword
diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py
index c261a41e1e77e..07d9c56db8a6e 100644
--- a/pandas/core/arrays/base.py
+++ b/pandas/core/arrays/base.py
@@ -1005,11 +1005,6 @@ def _values_for_factorize(self) -> tuple[np.ndarray, Any]:
             as NA in the factorization routines, so it will be coded as
             `-1` and not included in `uniques`. By default,
             ``np.nan`` is used.
-
-        Notes
-        -----
-        The values returned by this method are also used in
-        :func:`pandas.util.hash_pandas_object`.
         """
         return self.astype(object), np.nan
 
@@ -1455,6 +1450,21 @@ def _reduce(self, name: str, *, skipna: bool = True, **kwargs):
     # Non-Optimized Default Methods; in the case of the private methods here,
     # these are not guaranteed to be stable across pandas versions.
 
+    def _hash_pandas_object(
+        self, *, encoding: str, hash_key: str, categorize: bool
+    ) -> npt.NDArray[np.uint64]:
+        """
+        Hook for hash_pandas_object.
+
+        Default is likely non-performant.
+        """
+        from pandas.core.util.hashing import hash_array
+
+        values = self.to_numpy(copy=False)
+        return hash_array(
+            values, encoding=encoding, hash_key=hash_key, categorize=categorize
+        )
+
     def tolist(self) -> list:
         """
         Return a list of the values.
diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py
index fb953e601735e..15a45f9574fbc 100644
--- a/pandas/core/arrays/categorical.py
+++ b/pandas/core/arrays/categorical.py
@@ -1762,6 +1762,38 @@ def _values_for_rank(self):
             )
         return values
 
+    def _hash_pandas_object(
+        self, *, encoding: str, hash_key: str, categorize: bool
+    ) -> npt.NDArray[np.uint64]:
+        """
+        Hash a Categorical by hashing its categories, and then mapping the codes
+        to the hashes
+        """
+        # Note we ignore categorize, as we are already Categorical.
+        from pandas.core.util.hashing import hash_array
+
+        # Convert ExtensionArrays to ndarrays
+        values = np.asarray(self.categories._values)
+        hashed = hash_array(values, encoding, hash_key, categorize=False)
+
+        # we have uint64, as we don't directly support missing values
+        # we don't want to use take_nd which will coerce to float
+        # instead, directly construct the result with a
+        # max(np.uint64) as the missing value indicator
+        #
+        # TODO: GH#15362
+
+        mask = self.isna()
+        if len(hashed):
+            result = hashed.take(self._codes)
+        else:
+            result = np.zeros(len(mask), dtype="uint64")
+
+        if mask.any():
+            result[mask] = lib.u8max
+
+        return result
+
     # ------------------------------------------------------------------
     # NDArrayBackedExtensionArray compat
 
diff --git a/pandas/core/util/hashing.py b/pandas/core/util/hashing.py
index 350914cc50556..280e9068b9f44 100644
--- a/pandas/core/util/hashing.py
+++ b/pandas/core/util/hashing.py
@@ -9,22 +9,17 @@
     Hashable,
     Iterable,
     Iterator,
-    cast,
 )
 
 import numpy as np
 
-from pandas._libs import lib
 from pandas._libs.hashing import hash_object_array
 from pandas._typing import (
     ArrayLike,
     npt,
 )
 
-from pandas.core.dtypes.common import (
-    is_categorical_dtype,
-    is_list_like,
-)
+from pandas.core.dtypes.common import is_list_like
 from pandas.core.dtypes.generic import (
     ABCDataFrame,
     ABCExtensionArray,
@@ -35,7 +30,6 @@
 
 if TYPE_CHECKING:
     from pandas import (
-        Categorical,
         DataFrame,
         Index,
         MultiIndex,
@@ -214,53 +208,14 @@ def hash_tuples(
 
     # hash the list-of-ndarrays
     hashes = (
-        _hash_categorical(cat, encoding=encoding, hash_key=hash_key) for cat in cat_vals
+        cat._hash_pandas_object(encoding=encoding, hash_key=hash_key, categorize=False)
+        for cat in cat_vals
     )
 
     h = combine_hash_arrays(hashes, len(cat_vals))
     return h
 
 
-def _hash_categorical(
-    cat: Categorical, encoding: str, hash_key: str
-) -> npt.NDArray[np.uint64]:
-    """
-    Hash a Categorical by hashing its categories, and then mapping the codes
-    to the hashes
-
-    Parameters
-    ----------
-    cat : Categorical
-    encoding : str
-    hash_key : str
-
-    Returns
-    -------
-    ndarray[np.uint64] of hashed values, same size as len(c)
-    """
-    # Convert ExtensionArrays to ndarrays
-    values = np.asarray(cat.categories._values)
-    hashed = hash_array(values, encoding, hash_key, categorize=False)
-
-    # we have uint64, as we don't directly support missing values
-    # we don't want to use take_nd which will coerce to float
-    # instead, directly construct the result with a
-    # max(np.uint64) as the missing value indicator
-    #
-    # TODO: GH 15362
-
-    mask = cat.isna()
-    if len(hashed):
-        result = hashed.take(cat.codes)
-    else:
-        result = np.zeros(len(mask), dtype="uint64")
-
-    if mask.any():
-        result[mask] = lib.u8max
-
-    return result
-
-
 def hash_array(
     vals: ArrayLike,
     encoding: str = "utf8",
@@ -288,17 +243,11 @@ def hash_array(
     """
     if not hasattr(vals, "dtype"):
         raise TypeError("must pass a ndarray-like")
-    dtype = vals.dtype
-
-    # For categoricals, we hash the categories, then remap the codes to the
-    # hash values. (This check is above the complex check so that we don't ask
-    # numpy if categorical is a subdtype of complex, as it will choke).
-    if is_categorical_dtype(dtype):
-        vals = cast("Categorical", vals)
-        return _hash_categorical(vals, encoding, hash_key)
-    elif isinstance(vals, ABCExtensionArray):
-        vals, _ = vals._values_for_factorize()
 
+    if isinstance(vals, ABCExtensionArray):
+        return vals._hash_pandas_object(
+            encoding=encoding, hash_key=hash_key, categorize=categorize
+        )
     elif not isinstance(vals, np.ndarray):
         # GH#42003
 
@@ -347,7 +296,9 @@ def _hash_ndarray(
 
             codes, categories = factorize(vals, sort=False)
             cat = Categorical(codes, Index(categories), ordered=False, fastpath=True)
-            return _hash_categorical(cat, encoding, hash_key)
+            return cat._hash_pandas_object(
+                encoding=encoding, hash_key=hash_key, categorize=False
+            )
 
         try:
             vals = hash_object_array(vals, hash_key, encoding)
diff --git a/pandas/tests/extension/base/methods.py b/pandas/tests/extension/base/methods.py
index b74372017f303..c5414d316b067 100644
--- a/pandas/tests/extension/base/methods.py
+++ b/pandas/tests/extension/base/methods.py
@@ -18,6 +18,17 @@
 class BaseMethodsTests(BaseExtensionTests):
     """Various Series and DataFrame methods."""
 
+    def test_hash_pandas_object(self, data):
+        # _hash_pandas_object should return a uint64 ndarray of the same length
+        # as the data
+        res = data._hash_pandas_object(
+            encoding="utf-8",
+            hash_key=pd.core.util.hashing._default_hash_key,
+            categorize=False,
+        )
+        assert res.dtype == np.uint64
+        assert res.shape == data.shape
+
     def test_value_counts_default_dropna(self, data):
         # make sure we have consistent default dropna kwarg
         if not hasattr(data, "value_counts"):
diff --git a/pandas/tests/extension/json/test_json.py b/pandas/tests/extension/json/test_json.py
index 2e5e2fc77d6c4..32c8a73b6a0cb 100644
--- a/pandas/tests/extension/json/test_json.py
+++ b/pandas/tests/extension/json/test_json.py
@@ -240,6 +240,10 @@ class TestReduce(base.BaseNoReduceTests):
 
 
 class TestMethods(BaseJSON, base.BaseMethodsTests):
+    @pytest.mark.xfail(reason="ValueError: setting an array element with a sequence")
+    def test_hash_pandas_object(self, data):
+        super().test_hash_pandas_object(data)
+
     @unhashable
     def test_value_counts(self, all_data, dropna):
         super().test_value_counts(all_data, dropna)

From aad544cb88d957d99d8259530b95dc9e0f2f8bbe Mon Sep 17 00:00:00 2001
From: Brock
Date: Mon, 27 Feb 2023 14:30:29 -0800
Subject: [PATCH 2/5] update docstring

---
 pandas/core/arrays/base.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py
index 92b70626fce0f..4ba0113e833e3 100644
--- a/pandas/core/arrays/base.py
+++ b/pandas/core/arrays/base.py
@@ -140,6 +140,7 @@ class ExtensionArray:
     _from_factorized
     _from_sequence
     _from_sequence_of_strings
+    _hash_pandas_object
     _reduce
     _values_for_argsort
     _values_for_factorize

From 318adcfb1c6e00949f0c7c9ceaa5e11481ea042c Mon Sep 17 00:00:00 2001
From: Brock
Date: Tue, 28 Feb 2023 08:43:55 -0800
Subject: [PATCH 3/5] troubleshoot docbuild

---
 doc/source/reference/extensions.rst | 1 +
 1 file changed, 1 insertion(+)

diff --git a/doc/source/reference/extensions.rst b/doc/source/reference/extensions.rst
index 595b415ff7342..b33efd388bd60 100644
--- a/doc/source/reference/extensions.rst
+++ b/doc/source/reference/extensions.rst
@@ -38,6 +38,7 @@ objects.
    api.extensions.ExtensionArray._from_factorized
    api.extensions.ExtensionArray._from_sequence
    api.extensions.ExtensionArray._from_sequence_of_strings
+   api.extensions.ExtensionArray._hash_pandas_object
    api.extensions.ExtensionArray._reduce
    api.extensions.ExtensionArray._values_for_argsort
    api.extensions.ExtensionArray._values_for_factorize

From b024c7d93976fd4f1b1c598eaa192e1e3e7df2ed Mon Sep 17 00:00:00 2001
From: Brock
Date: Tue, 28 Feb 2023 09:30:42 -0800
Subject: [PATCH 4/5] troubleshoot code check build

---
 pandas/core/arrays/base.py        | 10 ++++++++++
 pandas/core/arrays/categorical.py | 13 ++++++++++++-
 2 files changed, 22 insertions(+), 1 deletion(-)

diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py
index 4ba0113e833e3..1a082a7579dc3 100644
--- a/pandas/core/arrays/base.py
+++ b/pandas/core/arrays/base.py
@@ -1455,6 +1455,16 @@ def _hash_pandas_object(
         Hook for hash_pandas_object.
 
         Default is likely non-performant.
+
+        Parameters
+        ----------
+        encoding : str
+        hash_key : str
+        categorize : bool
+
+        Returns
+        -------
+        np.ndarray[uint64]
         """
         from pandas.core.util.hashing import hash_array
 
diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py
index 80bc061c2f7aa..dd48da9ab6c16 100644
--- a/pandas/core/arrays/categorical.py
+++ b/pandas/core/arrays/categorical.py
@@ -1767,7 +1767,18 @@ def _hash_pandas_object(
     ) -> npt.NDArray[np.uint64]:
         """
         Hash a Categorical by hashing its categories, and then mapping the codes
-        to the hashes
+        to the hashes.
+
+        Parameters
+        ----------
+        encoding : str
+        hash_key : str
+        categorize : bool
+            Ignored for Categorical.
+
+        Returns
+        -------
+        np.ndarray[uint64]
         """
         # Note we ignore categorize, as we are already Categorical.
         from pandas.core.util.hashing import hash_array

From 9a7d6690a35f256889fe39d3d8b25c52c2e2ac8b Mon Sep 17 00:00:00 2001
From: Brock
Date: Tue, 28 Feb 2023 15:20:48 -0800
Subject: [PATCH 5/5] ignore in code_checks

---
 ci/code_checks.sh | 1 +
 1 file changed, 1 insertion(+)

diff --git a/ci/code_checks.sh b/ci/code_checks.sh
index 2d5e9cc031158..94710e1918466 100755
--- a/ci/code_checks.sh
+++ b/ci/code_checks.sh
@@ -512,6 +512,7 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
         pandas.api.extensions.ExtensionArray._from_factorized \
         pandas.api.extensions.ExtensionArray._from_sequence \
         pandas.api.extensions.ExtensionArray._from_sequence_of_strings \
+        pandas.api.extensions.ExtensionArray._hash_pandas_object \
         pandas.api.extensions.ExtensionArray._reduce \
         pandas.api.extensions.ExtensionArray._values_for_argsort \
         pandas.api.extensions.ExtensionArray._values_for_factorize \
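
As a quick illustration of what this series enables, the snippet below is a
minimal sketch written against this branch, not part of the diffs above;
_hash_pandas_object and _default_hash_key are private internals, so their
exact import locations are assumptions and may change.

    import numpy as np
    import pandas as pd
    from pandas.core.util.hashing import _default_hash_key

    # Categorical hashes its categories once and maps the codes onto those
    # hashes; missing codes are filled with the max-uint64 sentinel.
    cat = pd.Categorical(["a", "b", None, "a"])
    hashed = cat._hash_pandas_object(
        encoding="utf8", hash_key=_default_hash_key, categorize=False
    )
    assert hashed.dtype == np.uint64 and hashed.shape == (4,)
    assert hashed[0] == hashed[3]  # equal categories hash equally

    # Arrays without a specialized override fall back to the default added on
    # ExtensionArray (to_numpy() + hash_array), so hash_pandas_object keeps
    # working for any extension dtype, including third-party ones.
    ser = pd.Series([1, 2, None], dtype="Int64")
    print(pd.util.hash_pandas_object(ser, index=False))

Moving the Categorical logic onto the array and routing hash_array through the
new hook means pandas/core/util/hashing.py no longer needs the
is_categorical_dtype / _values_for_factorize special cases, and third-party
extension arrays can override the same hook when the object-dtype fallback is
too slow for them.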