Skip to content

Commit 2a1d2da

Browse files
Use _values_for_factorize by default for hashing ExtensionArrays (#53475)
1 parent 641427e commit 2a1d2da

File tree

3 files changed

+9
-11
lines changed

3 files changed

+9
-11
lines changed

doc/source/whatsnew/v2.0.3.rst

+1
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@ including other versions of pandas.
1414
Fixed regressions
1515
~~~~~~~~~~~~~~~~~
1616
- Fixed performance regression in merging on datetime-like columns (:issue:`53231`)
17+
- For external ExtensionArray implementations, restored the default use of ``_values_for_factorize`` for hashing arrays (:issue:`53475`)
1718
-
1819

1920
.. ---------------------------------------------------------------------------

pandas/core/arrays/base.py

+8-3
Original file line numberDiff line numberDiff line change
@@ -993,7 +993,6 @@ def _values_for_factorize(self) -> tuple[np.ndarray, Any]:
993993
Returns
994994
-------
995995
values : ndarray
996-
997996
An array suitable for factorization. This should maintain order
998997
and be a supported dtype (Float64, Int64, UInt64, String, Object).
999998
By default, the extension array is cast to object dtype.
@@ -1002,6 +1001,12 @@ def _values_for_factorize(self) -> tuple[np.ndarray, Any]:
10021001
as NA in the factorization routines, so it will be coded as
10031002
`-1` and not included in `uniques`. By default,
10041003
``np.nan`` is used.
1004+
1005+
Notes
1006+
-----
1007+
The values returned by this method are also used in
1008+
:func:`pandas.util.hash_pandas_object`. If needed, this can be
1009+
overridden in the ``self._hash_pandas_object()`` method.
10051010
"""
10061011
return self.astype(object), np.nan
10071012

@@ -1449,7 +1454,7 @@ def _hash_pandas_object(
14491454
"""
14501455
Hook for hash_pandas_object.
14511456
1452-
Default is likely non-performant.
1457+
Default is to use the values returned by _values_for_factorize.
14531458
14541459
Parameters
14551460
----------
@@ -1463,7 +1468,7 @@ def _hash_pandas_object(
14631468
"""
14641469
from pandas.core.util.hashing import hash_array
14651470

1466-
values = self.to_numpy(copy=False)
1471+
values, _ = self._values_for_factorize()
14671472
return hash_array(
14681473
values, encoding=encoding, hash_key=hash_key, categorize=categorize
14691474
)

pandas/tests/extension/json/test_json.py

-8
Original file line numberDiff line numberDiff line change
@@ -240,10 +240,6 @@ class TestReduce(base.BaseNoReduceTests):
240240

241241

242242
class TestMethods(BaseJSON, base.BaseMethodsTests):
243-
@pytest.mark.xfail(reason="ValueError: setting an array element with a sequence")
244-
def test_hash_pandas_object(self, data):
245-
super().test_hash_pandas_object(data)
246-
247243
@unhashable
248244
def test_value_counts(self, all_data, dropna):
249245
super().test_value_counts(all_data, dropna)
@@ -286,10 +282,6 @@ def test_combine_add(self, data_repeated):
286282
def test_combine_first(self, data):
287283
super().test_combine_first(data)
288284

289-
@unhashable
290-
def test_hash_pandas_object_works(self, data, kind):
291-
super().test_hash_pandas_object_works(data, kind)
292-
293285
@pytest.mark.xfail(reason="broadcasting error")
294286
def test_where_series(self, data, na_value):
295287
# Fails with

0 commit comments

Comments
 (0)