pandas-dev · simonjayhawkins · Jun 22, 2021 · Jun 22, 2021
diff --git a/pandas/_libs/hashtable.pyi b/pandas/_libs/hashtable.pyi
@@ -228,3 +228,5 @@ def ismember(
     arr: np.ndarray,
     values: np.ndarray,
 ) -> np.ndarray: ...  # np.ndarray[bool]
+def object_hash(obj) -> int: ...
+def objects_are_equal(a, b) -> bool: ...
diff --git a/pandas/_libs/hashtable.pyx b/pandas/_libs/hashtable.pyx
@@ -34,6 +34,8 @@ from pandas._libs.khash cimport (
     are_equivalent_khcomplex64_t,
     are_equivalent_khcomplex128_t,
     kh_needed_n_buckets,
+    kh_python_hash_equal,
+    kh_python_hash_func,
     kh_str_t,
     khcomplex64_t,
     khcomplex128_t,
@@ -46,6 +48,14 @@ def get_hashtable_trace_domain():
     return KHASH_TRACE_DOMAIN
 
 
+def object_hash(obj):
+    return kh_python_hash_func(obj)
+
+
+def objects_are_equal(a, b):
+    return kh_python_hash_equal(a, b)
+
+
 cdef int64_t NPY_NAT = util.get_nat()
 SIZE_HINT_LIMIT = (1 << 20) + 7
 

diff --git a/pandas/_libs/khash.pxd b/pandas/_libs/khash.pxd
@@ -41,6 +41,9 @@ cdef extern from "khash_python.h":
     bint are_equivalent_float32_t \
     "kh_floats_hash_equal" (float32_t a, float32_t b) nogil
 
+    uint32_t kh_python_hash_func(object key)
+    bint kh_python_hash_equal(object a, object b)
+
     ctypedef struct kh_pymap_t:
         khuint_t n_buckets, size, n_occupied, upper_bound
         uint32_t *flags

diff --git a/pandas/_libs/src/klib/khash_python.h b/pandas/_libs/src/klib/khash_python.h
@@ -287,7 +287,7 @@ Py_hash_t PANDAS_INLINE complexobject_hash(PyComplexObject* key) {
 }
 
 
-khint32_t PANDAS_INLINE kh_python_hash_func(PyObject* key);
+khuint32_t PANDAS_INLINE kh_python_hash_func(PyObject* key);
 
 //we could use any hashing algorithm, this is the original CPython's for tuples
 
@@ -328,7 +328,7 @@ Py_hash_t PANDAS_INLINE tupleobject_hash(PyTupleObject* key) {
 }
 
 
-khint32_t PANDAS_INLINE kh_python_hash_func(PyObject* key) {
+khuint32_t PANDAS_INLINE kh_python_hash_func(PyObject* key) {
     Py_hash_t hash;
     // For PyObject_Hash holds:
     //    hash(0.0) == 0 == hash(-0.0)

diff --git a/pandas/core/dtypes/base.py b/pandas/core/dtypes/base.py
@@ -12,6 +12,7 @@
 
 import numpy as np
 
+from pandas._libs.hashtable import object_hash
 from pandas._typing import (
     DtypeObj,
     type_t,
@@ -128,7 +129,9 @@ def __eq__(self, other: Any) -> bool:
         return False
 
     def __hash__(self) -> int:
-        return hash(tuple(getattr(self, attr) for attr in self._metadata))
+        # for python>=3.10, different nan objects have different hashes
+        # we need to avoid that and thus use a hash function with the old behavior
+        return object_hash(tuple(getattr(self, attr) for attr in self._metadata))
 
     def __ne__(self, other: Any) -> bool:
         return not self.__eq__(other)

diff --git a/pandas/tests/libs/test_hashtable.py b/pandas/tests/libs/test_hashtable.py
@@ -240,6 +240,13 @@ def test_nan_in_nested_tuple(self):
         assert str(error.value) == str(other)
 
 
+def test_hash_equal_tuple_with_nans():
+    a = (float("nan"), (float("nan"), float("nan")))
+    b = (float("nan"), (float("nan"), float("nan")))
+    assert ht.object_hash(a) == ht.object_hash(b)
+    assert ht.objects_are_equal(a, b)
+
+
 def test_get_labels_groupby_for_Int64(writable):
     table = ht.Int64HashTable()
     vals = np.array([1, 2, -1, 2, 1, -1], dtype=np.int64)