Skip to content

Use _values_for_factorize by default for hashing ExtensionArrays #53475

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 5 commits into from
Jun 19, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v2.0.3.rst
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ including other versions of pandas.
Fixed regressions
~~~~~~~~~~~~~~~~~
- Fixed performance regression in merging on datetime-like columns (:issue:`53231`)
- For external ExtensionArray implementations, restored the default use of ``_values_for_factorize`` for hashing arrays (:issue:`53475`)
-

.. ---------------------------------------------------------------------------
Expand Down
11 changes: 8 additions & 3 deletions pandas/core/arrays/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -993,7 +993,6 @@ def _values_for_factorize(self) -> tuple[np.ndarray, Any]:
Returns
-------
values : ndarray

An array suitable for factorization. This should maintain order
and be a supported dtype (Float64, Int64, UInt64, String, Object).
By default, the extension array is cast to object dtype.
Expand All @@ -1002,6 +1001,12 @@ def _values_for_factorize(self) -> tuple[np.ndarray, Any]:
as NA in the factorization routines, so it will be coded as
`-1` and not included in `uniques`. By default,
``np.nan`` is used.

Notes
-----
The values returned by this method are also used in
:func:`pandas.util.hash_pandas_object`. If a different representation is
needed for hashing, override the ``_hash_pandas_object()`` method instead.
"""
return self.astype(object), np.nan

Expand Down Expand Up @@ -1449,7 +1454,7 @@ def _hash_pandas_object(
"""
Hook for hash_pandas_object.

Default is likely non-performant.
The default implementation hashes the values returned by ``_values_for_factorize``.

Parameters
----------
Expand All @@ -1463,7 +1468,7 @@ def _hash_pandas_object(
"""
from pandas.core.util.hashing import hash_array

values = self.to_numpy(copy=False)
values, _ = self._values_for_factorize()
return hash_array(
values, encoding=encoding, hash_key=hash_key, categorize=categorize
)
Expand Down
8 changes: 0 additions & 8 deletions pandas/tests/extension/json/test_json.py
Original file line number Diff line number Diff line change
Expand Up @@ -240,10 +240,6 @@ class TestReduce(base.BaseNoReduceTests):


class TestMethods(BaseJSON, base.BaseMethodsTests):
@pytest.mark.xfail(reason="ValueError: setting an array element with a sequence")
def test_hash_pandas_object(self, data):
super().test_hash_pandas_object(data)

@unhashable
def test_value_counts(self, all_data, dropna):
super().test_value_counts(all_data, dropna)
Expand Down Expand Up @@ -286,10 +282,6 @@ def test_combine_add(self, data_repeated):
def test_combine_first(self, data):
super().test_combine_first(data)

@unhashable
def test_hash_pandas_object_works(self, data, kind):
super().test_hash_pandas_object_works(data, kind)

@pytest.mark.xfail(reason="broadcasting error")
def test_where_series(self, data, na_value):
# Fails with
Expand Down