Skip to content

Support ExtensionArray in hash_pandas_object #23082

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Oct 11, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions doc/source/api.rst
Original file line number Diff line number Diff line change
Expand Up @@ -245,6 +245,15 @@ Top-level evaluation

eval

Hashing
~~~~~~~

.. autosummary::
:toctree: generated/

util.hash_array
util.hash_pandas_object

Testing
~~~~~~~

Expand Down
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.24.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -548,6 +548,7 @@ update the ``ExtensionDtype._metadata`` tuple to match the signature of your
- :meth:`Series.astype` and :meth:`DataFrame.astype` now dispatch to :meth:`ExtensionArray.astype` (:issue:`21185:`).
- Slicing a single row of a ``DataFrame`` with multiple ExtensionArrays of the same type now preserves the dtype, rather than coercing to object (:issue:`22784`)
- Added :meth:`pandas.api.types.register_extension_dtype` to register an extension type with pandas (:issue:`22664`)
- Series backed by an ``ExtensionArray`` now work with :func:`util.hash_pandas_object` (:issue:`23066`)
- Updated the ``.type`` attribute for ``PeriodDtype``, ``DatetimeTZDtype``, and ``IntervalDtype`` to be instances of the dtype (``Period``, ``Timestamp``, and ``Interval`` respectively) (:issue:`22938`)

.. _whatsnew_0240.api.incompatibilities:
Expand Down
5 changes: 5 additions & 0 deletions pandas/core/arrays/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -466,6 +466,11 @@ def _values_for_factorize(self):
as NA in the factorization routines, so it will be coded as
`na_sentinal` and not included in `uniques`. By default,
``np.nan`` is used.

Notes
-----
The values returned by this method are also used in
:func:`pandas.util.hash_pandas_object`.
"""
return self.astype(object), np.nan

Expand Down
7 changes: 5 additions & 2 deletions pandas/core/util/hashing.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
ABCSeries,
ABCDataFrame)
from pandas.core.dtypes.common import (
is_categorical_dtype, is_list_like)
is_categorical_dtype, is_list_like, is_extension_array_dtype)
from pandas.core.dtypes.missing import isna
from pandas.core.dtypes.cast import infer_dtype_from_scalar

Expand Down Expand Up @@ -265,10 +265,13 @@ def hash_array(vals, encoding='utf8', hash_key=None, categorize=True):
# numpy if categorical is a subdtype of complex, as it will choke).
if is_categorical_dtype(dtype):
return _hash_categorical(vals, encoding, hash_key)
elif is_extension_array_dtype(dtype):
vals, _ = vals._values_for_factorize()
dtype = vals.dtype

# we'll be working with everything as 64-bit values, so handle this
# 128-bit value early
elif np.issubdtype(dtype, np.complex128):
if np.issubdtype(dtype, np.complex128):
return hash_array(vals.real) + 23 * hash_array(vals.imag)

# First, turn whatever array this is into unsigned 64-bit ints, if we can
Expand Down
1 change: 1 addition & 0 deletions pandas/tests/extension/base/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@


class BaseExtensionTests(object):
assert_equal = staticmethod(tm.assert_equal)
assert_series_equal = staticmethod(tm.assert_series_equal)
assert_frame_equal = staticmethod(tm.assert_frame_equal)
assert_extension_array_equal = staticmethod(
Expand Down
10 changes: 10 additions & 0 deletions pandas/tests/extension/base/methods.py
Original file line number Diff line number Diff line change
Expand Up @@ -164,3 +164,13 @@ def test_container_shift(self, data, frame, periods, indices):
compare = self.assert_series_equal

compare(result, expected)

@pytest.mark.parametrize("as_frame", [True, False])
def test_hash_pandas_object_works(self, data, as_frame):
# https://github.com/pandas-dev/pandas/issues/23066
data = pd.Series(data)
if as_frame:
data = data.to_frame()
a = pd.util.hash_pandas_object(data)
b = pd.util.hash_pandas_object(data)
self.assert_equal(a, b)
4 changes: 4 additions & 0 deletions pandas/tests/extension/json/test_json.py
Original file line number Diff line number Diff line change
Expand Up @@ -199,6 +199,10 @@ def test_combine_le(self, data_repeated):
def test_combine_add(self, data_repeated):
pass

@unhashable
def test_hash_pandas_object_works(self, data, kind):
super().test_hash_pandas_object_works(data, kind)


class TestCasting(BaseJSON, base.BaseCastingTests):
@pytest.mark.skip(reason="failing on np.array(self, dtype=str)")
Expand Down