From 5935febecf5f25113ade5e3c40768978c9d432ee Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 10 Oct 2018 21:02:14 -0500 Subject: [PATCH] Support ExtensionArray in hash_pandas_object Closes #23066 --- doc/source/api.rst | 9 +++++++++ doc/source/whatsnew/v0.24.0.txt | 1 + pandas/core/arrays/base.py | 5 +++++ pandas/core/util/hashing.py | 7 +++++-- pandas/tests/extension/base/base.py | 1 + pandas/tests/extension/base/methods.py | 10 ++++++++++ pandas/tests/extension/json/test_json.py | 4 ++++ 7 files changed, 35 insertions(+), 2 deletions(-) diff --git a/doc/source/api.rst b/doc/source/api.rst index ffa240febf731..f57531fffaaaa 100644 --- a/doc/source/api.rst +++ b/doc/source/api.rst @@ -245,6 +245,15 @@ Top-level evaluation eval +Hashing +~~~~~~~ + +.. autosummary:: + :toctree: generated/ + + util.hash_array + util.hash_pandas_object + Testing ~~~~~~~ diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt index 40dd48880e0eb..5e9ce875dddb8 100644 --- a/doc/source/whatsnew/v0.24.0.txt +++ b/doc/source/whatsnew/v0.24.0.txt @@ -548,6 +548,7 @@ update the ``ExtensionDtype._metadata`` tuple to match the signature of your - :meth:`Series.astype` and :meth:`DataFrame.astype` now dispatch to :meth:`ExtensionArray.astype` (:issue:`21185:`). - Slicing a single row of a ``DataFrame`` with multiple ExtensionArrays of the same type now preserves the dtype, rather than coercing to object (:issue:`22784`) - Added :meth:`pandas.api.types.register_extension_dtype` to register an extension type with pandas (:issue:`22664`) +- Series backed by an ``ExtensionArray`` now work with :func:`util.hash_pandas_object` (:issue:`23066`) - Updated the ``.type`` attribute for ``PeriodDtype``, ``DatetimeTZDtype``, and ``IntervalDtype`` to be instances of the dtype (``Period``, ``Timestamp``, and ``Interval`` respectively) (:issue:`22938`) .. 
_whatsnew_0240.api.incompatibilities: diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index efe587c6aaaad..627afd1b6f860 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -466,6 +466,11 @@ def _values_for_factorize(self): as NA in the factorization routines, so it will be coded as `na_sentinal` and not included in `uniques`. By default, ``np.nan`` is used. + + Notes + ----- + The values returned by this method are also used in + :func:`pandas.util.hash_pandas_object`. """ return self.astype(object), np.nan diff --git a/pandas/core/util/hashing.py b/pandas/core/util/hashing.py index e62d70847437c..e41885d525653 100644 --- a/pandas/core/util/hashing.py +++ b/pandas/core/util/hashing.py @@ -11,7 +11,7 @@ ABCSeries, ABCDataFrame) from pandas.core.dtypes.common import ( - is_categorical_dtype, is_list_like) + is_categorical_dtype, is_list_like, is_extension_array_dtype) from pandas.core.dtypes.missing import isna from pandas.core.dtypes.cast import infer_dtype_from_scalar @@ -265,10 +265,13 @@ def hash_array(vals, encoding='utf8', hash_key=None, categorize=True): # numpy if categorical is a subdtype of complex, as it will choke). 
if is_categorical_dtype(dtype): return _hash_categorical(vals, encoding, hash_key) + elif is_extension_array_dtype(dtype): + vals, _ = vals._values_for_factorize() + dtype = vals.dtype # we'll be working with everything as 64-bit values, so handle this # 128-bit value early - elif np.issubdtype(dtype, np.complex128): + if np.issubdtype(dtype, np.complex128): return hash_array(vals.real) + 23 * hash_array(vals.imag) # First, turn whatever array this is into unsigned 64-bit ints, if we can diff --git a/pandas/tests/extension/base/base.py b/pandas/tests/extension/base/base.py index beb7948f2c14b..2a4a1b9c4668b 100644 --- a/pandas/tests/extension/base/base.py +++ b/pandas/tests/extension/base/base.py @@ -2,6 +2,7 @@ class BaseExtensionTests(object): + assert_equal = staticmethod(tm.assert_equal) assert_series_equal = staticmethod(tm.assert_series_equal) assert_frame_equal = staticmethod(tm.assert_frame_equal) assert_extension_array_equal = staticmethod( diff --git a/pandas/tests/extension/base/methods.py b/pandas/tests/extension/base/methods.py index 4e7886dd2e943..dce91d5a9ca9c 100644 --- a/pandas/tests/extension/base/methods.py +++ b/pandas/tests/extension/base/methods.py @@ -164,3 +164,13 @@ def test_container_shift(self, data, frame, periods, indices): compare = self.assert_series_equal compare(result, expected) + + @pytest.mark.parametrize("as_frame", [True, False]) + def test_hash_pandas_object_works(self, data, as_frame): + # https://github.com/pandas-dev/pandas/issues/23066 + data = pd.Series(data) + if as_frame: + data = data.to_frame() + a = pd.util.hash_pandas_object(data) + b = pd.util.hash_pandas_object(data) + self.assert_equal(a, b) diff --git a/pandas/tests/extension/json/test_json.py b/pandas/tests/extension/json/test_json.py index 115afdcc99f2b..6c8b12ed865fc 100644 --- a/pandas/tests/extension/json/test_json.py +++ b/pandas/tests/extension/json/test_json.py @@ -199,6 +199,10 @@ def test_combine_le(self, data_repeated): def test_combine_add(self, 
data_repeated): pass + @unhashable + def test_hash_pandas_object_works(self, data, as_frame): + super().test_hash_pandas_object_works(data, as_frame) + class TestCasting(BaseJSON, base.BaseCastingTests): @pytest.mark.skip(reason="failing on np.array(self, dtype=str)")