diff --git a/pandas/_libs/hashtable.pyi b/pandas/_libs/hashtable.pyi index 5a1b98b190dbc..951703e04d5a3 100644 --- a/pandas/_libs/hashtable.pyi +++ b/pandas/_libs/hashtable.pyi @@ -228,3 +228,5 @@ def ismember( arr: np.ndarray, values: np.ndarray, ) -> np.ndarray: ... # np.ndarray[bool] +def object_hash(obj) -> int: ... +def objects_are_equal(a, b) -> bool: ... diff --git a/pandas/_libs/hashtable.pyx b/pandas/_libs/hashtable.pyx index 7df3f69337643..132435701bddb 100644 --- a/pandas/_libs/hashtable.pyx +++ b/pandas/_libs/hashtable.pyx @@ -34,6 +34,8 @@ from pandas._libs.khash cimport ( are_equivalent_khcomplex64_t, are_equivalent_khcomplex128_t, kh_needed_n_buckets, + kh_python_hash_equal, + kh_python_hash_func, kh_str_t, khcomplex64_t, khcomplex128_t, @@ -46,6 +48,14 @@ def get_hashtable_trace_domain(): return KHASH_TRACE_DOMAIN +def object_hash(obj): + return kh_python_hash_func(obj) + + +def objects_are_equal(a, b): + return kh_python_hash_equal(a, b) + + cdef int64_t NPY_NAT = util.get_nat() SIZE_HINT_LIMIT = (1 << 20) + 7 diff --git a/pandas/_libs/khash.pxd b/pandas/_libs/khash.pxd index ba805e9ff1251..b9c18d6c86039 100644 --- a/pandas/_libs/khash.pxd +++ b/pandas/_libs/khash.pxd @@ -41,6 +41,9 @@ cdef extern from "khash_python.h": bint are_equivalent_float32_t \ "kh_floats_hash_equal" (float32_t a, float32_t b) nogil + uint32_t kh_python_hash_func(object key) + bint kh_python_hash_equal(object a, object b) + ctypedef struct kh_pymap_t: khuint_t n_buckets, size, n_occupied, upper_bound uint32_t *flags diff --git a/pandas/_libs/src/klib/khash_python.h b/pandas/_libs/src/klib/khash_python.h index 4d7875be7e5fd..04a6bf48c50c2 100644 --- a/pandas/_libs/src/klib/khash_python.h +++ b/pandas/_libs/src/klib/khash_python.h @@ -287,7 +287,7 @@ Py_hash_t PANDAS_INLINE complexobject_hash(PyComplexObject* key) { } -khint32_t PANDAS_INLINE kh_python_hash_func(PyObject* key); +khuint32_t PANDAS_INLINE kh_python_hash_func(PyObject* key); //we could use any hashing algorithm, 
this is the original CPython's for tuples @@ -328,7 +328,7 @@ Py_hash_t PANDAS_INLINE tupleobject_hash(PyTupleObject* key) { } -khint32_t PANDAS_INLINE kh_python_hash_func(PyObject* key) { +khuint32_t PANDAS_INLINE kh_python_hash_func(PyObject* key) { Py_hash_t hash; // For PyObject_Hash holds: // hash(0.0) == 0 == hash(-0.0) diff --git a/pandas/core/dtypes/base.py b/pandas/core/dtypes/base.py index e52b318c0b4f7..5b7dadac5d914 100644 --- a/pandas/core/dtypes/base.py +++ b/pandas/core/dtypes/base.py @@ -12,6 +12,7 @@ import numpy as np +from pandas._libs.hashtable import object_hash from pandas._typing import ( DtypeObj, type_t, @@ -128,7 +129,9 @@ def __eq__(self, other: Any) -> bool: return False def __hash__(self) -> int: - return hash(tuple(getattr(self, attr) for attr in self._metadata)) + # for python>=3.10, different nan objects have different hashes + # we need to avoid that and thus use hash function with old behavior + return object_hash(tuple(getattr(self, attr) for attr in self._metadata)) def __ne__(self, other: Any) -> bool: return not self.__eq__(other) diff --git a/pandas/tests/libs/test_hashtable.py b/pandas/tests/libs/test_hashtable.py index a1a43fa6ef300..08bfc74e0ef8d 100644 --- a/pandas/tests/libs/test_hashtable.py +++ b/pandas/tests/libs/test_hashtable.py @@ -240,6 +240,13 @@ def test_nan_in_nested_tuple(self): assert str(error.value) == str(other) +def test_hash_equal_tuple_with_nans(): + a = (float("nan"), (float("nan"), float("nan"))) + b = (float("nan"), (float("nan"), float("nan"))) + assert ht.object_hash(a) == ht.object_hash(b) + assert ht.objects_are_equal(a, b) + + def test_get_labels_groupby_for_Int64(writable): table = ht.Int64HashTable() vals = np.array([1, 2, -1, 2, 1, -1], dtype=np.int64)