Skip to content

Commit 02adb3d

Browse files
authored
ENH: EA._hash_pandas_object (#51319)
1 parent d89f162 commit 02adb3d

File tree

8 files changed

+106
-64
lines changed

8 files changed

+106
-64
lines changed

ci/code_checks.sh

+1
Original file line numberDiff line numberDiff line change
@@ -512,6 +512,7 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
512512
pandas.api.extensions.ExtensionArray._from_factorized \
513513
pandas.api.extensions.ExtensionArray._from_sequence \
514514
pandas.api.extensions.ExtensionArray._from_sequence_of_strings \
515+
pandas.api.extensions.ExtensionArray._hash_pandas_object \
515516
pandas.api.extensions.ExtensionArray._reduce \
516517
pandas.api.extensions.ExtensionArray._values_for_argsort \
517518
pandas.api.extensions.ExtensionArray._values_for_factorize \

doc/source/reference/extensions.rst

+1
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@ objects.
3838
api.extensions.ExtensionArray._from_factorized
3939
api.extensions.ExtensionArray._from_sequence
4040
api.extensions.ExtensionArray._from_sequence_of_strings
41+
api.extensions.ExtensionArray._hash_pandas_object
4142
api.extensions.ExtensionArray._reduce
4243
api.extensions.ExtensionArray._values_for_argsort
4344
api.extensions.ExtensionArray._values_for_factorize

pandas/core/arrays/_mixins.py

+10
Original file line numberDiff line numberDiff line change
@@ -192,6 +192,16 @@ def _values_for_argsort(self) -> np.ndarray:
192192
def _values_for_factorize(self):
193193
return self._ndarray, self._internal_fill_value
194194

195+
def _hash_pandas_object(
196+
self, *, encoding: str, hash_key: str, categorize: bool
197+
) -> npt.NDArray[np.uint64]:
198+
from pandas.core.util.hashing import hash_array
199+
200+
values = self._ndarray
201+
return hash_array(
202+
values, encoding=encoding, hash_key=hash_key, categorize=categorize
203+
)
204+
195205
# Signature of "argmin" incompatible with supertype "ExtensionArray"
196206
def argmin(self, axis: AxisInt = 0, skipna: bool = True): # type: ignore[override]
197207
# override base class by adding axis keyword

pandas/core/arrays/base.py

+26-5
Original file line numberDiff line numberDiff line change
@@ -140,6 +140,7 @@ class ExtensionArray:
140140
_from_factorized
141141
_from_sequence
142142
_from_sequence_of_strings
143+
_hash_pandas_object
143144
_reduce
144145
_values_for_argsort
145146
_values_for_factorize
@@ -1002,11 +1003,6 @@ def _values_for_factorize(self) -> tuple[np.ndarray, Any]:
10021003
as NA in the factorization routines, so it will be coded as
10031004
`-1` and not included in `uniques`. By default,
10041005
``np.nan`` is used.
1005-
1006-
Notes
1007-
-----
1008-
The values returned by this method are also used in
1009-
:func:`pandas.util.hash_pandas_object`.
10101006
"""
10111007
return self.astype(object), np.nan
10121008

@@ -1452,6 +1448,31 @@ def _reduce(self, name: str, *, skipna: bool = True, **kwargs):
14521448
# Non-Optimized Default Methods; in the case of the private methods here,
14531449
# these are not guaranteed to be stable across pandas versions.
14541450

1451+
def _hash_pandas_object(
1452+
self, *, encoding: str, hash_key: str, categorize: bool
1453+
) -> npt.NDArray[np.uint64]:
1454+
"""
1455+
Hook for hash_pandas_object.
1456+
1457+
Default is likely non-performant.
1458+
1459+
Parameters
1460+
----------
1461+
encoding : str
1462+
hash_key : str
1463+
categorize : bool
1464+
1465+
Returns
1466+
-------
1467+
np.ndarray[uint64]
1468+
"""
1469+
from pandas.core.util.hashing import hash_array
1470+
1471+
values = self.to_numpy(copy=False)
1472+
return hash_array(
1473+
values, encoding=encoding, hash_key=hash_key, categorize=categorize
1474+
)
1475+
14551476
def tolist(self) -> list:
14561477
"""
14571478
Return a list of the values.

pandas/core/arrays/categorical.py

+43
Original file line numberDiff line numberDiff line change
@@ -1762,6 +1762,49 @@ def _values_for_rank(self):
17621762
)
17631763
return values
17641764

1765+
def _hash_pandas_object(
1766+
self, *, encoding: str, hash_key: str, categorize: bool
1767+
) -> npt.NDArray[np.uint64]:
1768+
"""
1769+
Hash a Categorical by hashing its categories, and then mapping the codes
1770+
to the hashes.
1771+
1772+
Parameters
1773+
----------
1774+
encoding : str
1775+
hash_key : str
1776+
categorize : bool
1777+
Ignored for Categorical.
1778+
1779+
Returns
1780+
-------
1781+
np.ndarray[uint64]
1782+
"""
1783+
# Note we ignore categorize, as we are already Categorical.
1784+
from pandas.core.util.hashing import hash_array
1785+
1786+
# Convert ExtensionArrays to ndarrays
1787+
values = np.asarray(self.categories._values)
1788+
hashed = hash_array(values, encoding, hash_key, categorize=False)
1789+
1790+
# we have uint64, as we don't directly support missing values
1791+
# we don't want to use take_nd which will coerce to float
1792+
# instead, directly construct the result with a
1793+
# max(np.uint64) as the missing value indicator
1794+
#
1795+
# TODO: GH#15362
1796+
1797+
mask = self.isna()
1798+
if len(hashed):
1799+
result = hashed.take(self._codes)
1800+
else:
1801+
result = np.zeros(len(mask), dtype="uint64")
1802+
1803+
if mask.any():
1804+
result[mask] = lib.u8max
1805+
1806+
return result
1807+
17651808
# ------------------------------------------------------------------
17661809
# NDArrayBackedExtensionArray compat
17671810

pandas/core/util/hashing.py

+10-59
Original file line numberDiff line numberDiff line change
@@ -9,22 +9,17 @@
99
Hashable,
1010
Iterable,
1111
Iterator,
12-
cast,
1312
)
1413

1514
import numpy as np
1615

17-
from pandas._libs import lib
1816
from pandas._libs.hashing import hash_object_array
1917
from pandas._typing import (
2018
ArrayLike,
2119
npt,
2220
)
2321

24-
from pandas.core.dtypes.common import (
25-
is_categorical_dtype,
26-
is_list_like,
27-
)
22+
from pandas.core.dtypes.common import is_list_like
2823
from pandas.core.dtypes.generic import (
2924
ABCDataFrame,
3025
ABCExtensionArray,
@@ -35,7 +30,6 @@
3530

3631
if TYPE_CHECKING:
3732
from pandas import (
38-
Categorical,
3933
DataFrame,
4034
Index,
4135
MultiIndex,
@@ -214,53 +208,14 @@ def hash_tuples(
214208

215209
# hash the list-of-ndarrays
216210
hashes = (
217-
_hash_categorical(cat, encoding=encoding, hash_key=hash_key) for cat in cat_vals
211+
cat._hash_pandas_object(encoding=encoding, hash_key=hash_key, categorize=False)
212+
for cat in cat_vals
218213
)
219214
h = combine_hash_arrays(hashes, len(cat_vals))
220215

221216
return h
222217

223218

224-
def _hash_categorical(
225-
cat: Categorical, encoding: str, hash_key: str
226-
) -> npt.NDArray[np.uint64]:
227-
"""
228-
Hash a Categorical by hashing its categories, and then mapping the codes
229-
to the hashes
230-
231-
Parameters
232-
----------
233-
cat : Categorical
234-
encoding : str
235-
hash_key : str
236-
237-
Returns
238-
-------
239-
ndarray[np.uint64] of hashed values, same size as len(c)
240-
"""
241-
# Convert ExtensionArrays to ndarrays
242-
values = np.asarray(cat.categories._values)
243-
hashed = hash_array(values, encoding, hash_key, categorize=False)
244-
245-
# we have uint64, as we don't directly support missing values
246-
# we don't want to use take_nd which will coerce to float
247-
# instead, directly construct the result with a
248-
# max(np.uint64) as the missing value indicator
249-
#
250-
# TODO: GH 15362
251-
252-
mask = cat.isna()
253-
if len(hashed):
254-
result = hashed.take(cat.codes)
255-
else:
256-
result = np.zeros(len(mask), dtype="uint64")
257-
258-
if mask.any():
259-
result[mask] = lib.u8max
260-
261-
return result
262-
263-
264219
def hash_array(
265220
vals: ArrayLike,
266221
encoding: str = "utf8",
@@ -288,17 +243,11 @@ def hash_array(
288243
"""
289244
if not hasattr(vals, "dtype"):
290245
raise TypeError("must pass a ndarray-like")
291-
dtype = vals.dtype
292-
293-
# For categoricals, we hash the categories, then remap the codes to the
294-
# hash values. (This check is above the complex check so that we don't ask
295-
# numpy if categorical is a subdtype of complex, as it will choke).
296-
if is_categorical_dtype(dtype):
297-
vals = cast("Categorical", vals)
298-
return _hash_categorical(vals, encoding, hash_key)
299246

300-
elif isinstance(vals, ABCExtensionArray):
301-
vals, _ = vals._values_for_factorize()
247+
if isinstance(vals, ABCExtensionArray):
248+
return vals._hash_pandas_object(
249+
encoding=encoding, hash_key=hash_key, categorize=categorize
250+
)
302251

303252
elif not isinstance(vals, np.ndarray):
304253
# GH#42003
@@ -347,7 +296,9 @@ def _hash_ndarray(
347296

348297
codes, categories = factorize(vals, sort=False)
349298
cat = Categorical(codes, Index(categories), ordered=False, fastpath=True)
350-
return _hash_categorical(cat, encoding, hash_key)
299+
return cat._hash_pandas_object(
300+
encoding=encoding, hash_key=hash_key, categorize=False
301+
)
351302

352303
try:
353304
vals = hash_object_array(vals, hash_key, encoding)

pandas/tests/extension/base/methods.py

+11
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,17 @@
1818
class BaseMethodsTests(BaseExtensionTests):
1919
"""Various Series and DataFrame methods."""
2020

21+
def test_hash_pandas_object(self, data):
22+
# _hash_pandas_object should return a uint64 ndarray of the same length
23+
# as the data
24+
res = data._hash_pandas_object(
25+
encoding="utf-8",
26+
hash_key=pd.core.util.hashing._default_hash_key,
27+
categorize=False,
28+
)
29+
assert res.dtype == np.uint64
30+
assert res.shape == data.shape
31+
2132
def test_value_counts_default_dropna(self, data):
2233
# make sure we have consistent default dropna kwarg
2334
if not hasattr(data, "value_counts"):

pandas/tests/extension/json/test_json.py

+4
Original file line numberDiff line numberDiff line change
@@ -240,6 +240,10 @@ class TestReduce(base.BaseNoReduceTests):
240240

241241

242242
class TestMethods(BaseJSON, base.BaseMethodsTests):
243+
@pytest.mark.xfail(reason="ValueError: setting an array element with a sequence")
244+
def test_hash_pandas_object(self, data):
245+
super().test_hash_pandas_object(data)
246+
243247
@unhashable
244248
def test_value_counts(self, all_data, dropna):
245249
super().test_value_counts(all_data, dropna)

0 commit comments

Comments (0)