Skip to content

Commit 1e1aa5c

Browse files
committed
Support ExtensionArray in hash_pandas_object
Closes pandas-dev#23066
1 parent 296c251 commit 1e1aa5c

File tree

7 files changed

+34
-2
lines changed

7 files changed

+34
-2
lines changed

doc/source/api.rst

+9
Original file line numberDiff line numberDiff line change
@@ -245,6 +245,15 @@ Top-level evaluation
245245

246246
eval
247247

248+
Hashing
249+
~~~~~~~
250+
251+
.. autosummary::
252+
:toctree: generated/
253+
254+
util.hash_array
255+
util.hash_pandas_object
256+
248257
Testing
249258
~~~~~~~
250259

doc/source/whatsnew/v0.24.0.txt

+1
Original file line numberDiff line numberDiff line change
@@ -548,6 +548,7 @@ update the ``ExtensionDtype._metadata`` tuple to match the signature of your
548548
- :meth:`Series.astype` and :meth:`DataFrame.astype` now dispatch to :meth:`ExtensionArray.astype` (:issue:`21185`).
549549
- Slicing a single row of a ``DataFrame`` with multiple ExtensionArrays of the same type now preserves the dtype, rather than coercing to object (:issue:`22784`)
550550
- Added :meth:`pandas.api.types.register_extension_dtype` to register an extension type with pandas (:issue:`22664`)
551+
- Series backed by an ``ExtensionArray`` now work with :func:`util.hash_pandas_object` (:issue:`23066`)
551552
- Updated the ``.type`` attribute for ``PeriodDtype``, ``DatetimeTZDtype``, and ``IntervalDtype`` to be instances of the dtype (``Period``, ``Timestamp``, and ``Interval`` respectively) (:issue:`22938`)
552553

553554
.. _whatsnew_0240.api.incompatibilities:

pandas/core/arrays/base.py

+5
Original file line numberDiff line numberDiff line change
@@ -466,6 +466,11 @@ def _values_for_factorize(self):
466466
as NA in the factorization routines, so it will be coded as
467467
`na_sentinel` and not included in `uniques`. By default,
468468
``np.nan`` is used.
469+
470+
Notes
471+
-----
472+
The values returned by this method are also used in
473+
:func:`pandas.util.hash_pandas_object`.
469474
"""
470475
return self.astype(object), np.nan
471476

pandas/core/util/hashing.py

+5-2
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
ABCSeries,
1212
ABCDataFrame)
1313
from pandas.core.dtypes.common import (
14-
is_categorical_dtype, is_list_like)
14+
is_categorical_dtype, is_list_like, is_extension_array_dtype)
1515
from pandas.core.dtypes.missing import isna
1616
from pandas.core.dtypes.cast import infer_dtype_from_scalar
1717

@@ -265,10 +265,13 @@ def hash_array(vals, encoding='utf8', hash_key=None, categorize=True):
265265
# numpy if categorical is a subdtype of complex, as it will choke).
266266
if is_categorical_dtype(dtype):
267267
return _hash_categorical(vals, encoding, hash_key)
268+
elif is_extension_array_dtype(dtype):
269+
vals, _ = vals._values_for_factorize()
270+
dtype = vals.dtype
268271

269272
# we'll be working with everything as 64-bit values, so handle this
270273
# 128-bit value early
271-
elif np.issubdtype(dtype, np.complex128):
274+
if np.issubdtype(dtype, np.complex128):
272275
return hash_array(vals.real) + 23 * hash_array(vals.imag)
273276

274277
# First, turn whatever array this is into unsigned 64-bit ints, if we can

pandas/tests/extension/base/base.py

+1
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22

33

44
class BaseExtensionTests(object):
5+
assert_equal = staticmethod(tm.assert_equal)
56
assert_series_equal = staticmethod(tm.assert_series_equal)
67
assert_frame_equal = staticmethod(tm.assert_frame_equal)
78
assert_extension_array_equal = staticmethod(

pandas/tests/extension/base/methods.py

+9
Original file line numberDiff line numberDiff line change
@@ -164,3 +164,12 @@ def test_container_shift(self, data, frame, periods, indices):
164164
compare = self.assert_series_equal
165165

166166
compare(result, expected)
167+
168+
@pytest.mark.parametrize("as_frame", [True, False])
169+
def test_hash_pandas_object_works(self, data, as_frame):
170+
data = pd.Series(data)
171+
if as_frame:
172+
data = data.to_frame()
173+
a = pd.util.hash_pandas_object(data)
174+
b = pd.util.hash_pandas_object(data)
175+
self.assert_equal(a, b)

pandas/tests/extension/json/test_json.py

+4
Original file line numberDiff line numberDiff line change
@@ -199,6 +199,10 @@ def test_combine_le(self, data_repeated):
199199
def test_combine_add(self, data_repeated):
200200
pass
201201

202+
@unhashable
203+
def test_hash_pandas_object_works(self, data, kind):
204+
super().test_hash_pandas_object_works(data, kind)
205+
202206

203207
class TestCasting(BaseJSON, base.BaseCastingTests):
204208
@pytest.mark.skip(reason="failing on np.array(self, dtype=str)")

0 commit comments

Comments
 (0)