From b2ecad5c40fbddfcd6e158513a0e74450f94c7a6 Mon Sep 17 00:00:00 2001 From: Egor Dranischnikow Date: Fri, 18 Jun 2021 22:43:56 +0200 Subject: [PATCH 1/4] fix signedness (should be unsigned) of the return type for hash, was wrongly defined in #39592 --- pandas/_libs/src/klib/khash_python.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/_libs/src/klib/khash_python.h b/pandas/_libs/src/klib/khash_python.h index c8e1ca5ebb4d3..83b1aad38b46d 100644 --- a/pandas/_libs/src/klib/khash_python.h +++ b/pandas/_libs/src/klib/khash_python.h @@ -284,7 +284,7 @@ Py_hash_t PANDAS_INLINE complexobject_hash(PyComplexObject* key) { } -khint32_t PANDAS_INLINE kh_python_hash_func(PyObject* key); +khuint32_t PANDAS_INLINE kh_python_hash_func(PyObject* key); //we could use any hashing algorithm, this is the original CPython's for tuples @@ -325,7 +325,7 @@ Py_hash_t PANDAS_INLINE tupleobject_hash(PyTupleObject* key) { } -khint32_t PANDAS_INLINE kh_python_hash_func(PyObject* key) { +khuint32_t PANDAS_INLINE kh_python_hash_func(PyObject* key) { Py_hash_t hash; // For PyObject_Hash holds: // hash(0.0) == 0 == hash(-0.0) From 861c2115094b808bf3bc687096c171ef1df370e0 Mon Sep 17 00:00:00 2001 From: Egor Dranischnikow Date: Fri, 18 Jun 2021 22:49:32 +0200 Subject: [PATCH 2/4] make hash and equal functions accessible from pure python --- pandas/_libs/hashtable.pyx | 10 ++++++++++ pandas/_libs/khash.pxd | 3 +++ pandas/tests/libs/test_hashtable.py | 7 +++++++ 3 files changed, 20 insertions(+) diff --git a/pandas/_libs/hashtable.pyx b/pandas/_libs/hashtable.pyx index 7df3f69337643..132435701bddb 100644 --- a/pandas/_libs/hashtable.pyx +++ b/pandas/_libs/hashtable.pyx @@ -34,6 +34,8 @@ from pandas._libs.khash cimport ( are_equivalent_khcomplex64_t, are_equivalent_khcomplex128_t, kh_needed_n_buckets, + kh_python_hash_equal, + kh_python_hash_func, kh_str_t, khcomplex64_t, khcomplex128_t, @@ -46,6 +48,14 @@ def get_hashtable_trace_domain(): return KHASH_TRACE_DOMAIN +def 
object_hash(obj): + return kh_python_hash_func(obj) + + +def objects_are_equal(a, b): + return kh_python_hash_equal(a, b) + + cdef int64_t NPY_NAT = util.get_nat() SIZE_HINT_LIMIT = (1 << 20) + 7 diff --git a/pandas/_libs/khash.pxd b/pandas/_libs/khash.pxd index ba805e9ff1251..b9c18d6c86039 100644 --- a/pandas/_libs/khash.pxd +++ b/pandas/_libs/khash.pxd @@ -41,6 +41,9 @@ cdef extern from "khash_python.h": bint are_equivalent_float32_t \ "kh_floats_hash_equal" (float32_t a, float32_t b) nogil + uint32_t kh_python_hash_func(object key) + bint kh_python_hash_equal(object a, object b) + ctypedef struct kh_pymap_t: khuint_t n_buckets, size, n_occupied, upper_bound uint32_t *flags diff --git a/pandas/tests/libs/test_hashtable.py b/pandas/tests/libs/test_hashtable.py index a1a43fa6ef300..08bfc74e0ef8d 100644 --- a/pandas/tests/libs/test_hashtable.py +++ b/pandas/tests/libs/test_hashtable.py @@ -240,6 +240,13 @@ def test_nan_in_nested_tuple(self): assert str(error.value) == str(other) +def test_hash_equal_tuple_with_nans(): + a = (float("nan"), (float("nan"), float("nan"))) + b = (float("nan"), (float("nan"), float("nan"))) + assert ht.object_hash(a) == ht.object_hash(b) + assert ht.objects_are_equal(a, b) + + def test_get_labels_groupby_for_Int64(writable): table = ht.Int64HashTable() vals = np.array([1, 2, -1, 2, 1, -1], dtype=np.int64) From 9770e4efaa4c4d17b7a30020ce58c3bae8f884f8 Mon Sep 17 00:00:00 2001 From: Egor Dranischnikow Date: Sat, 19 Jun 2021 21:55:28 +0200 Subject: [PATCH 3/4] do not use hash-function because for python>=3.10 it no longer has the desired behavior for nans --- pandas/core/dtypes/base.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pandas/core/dtypes/base.py b/pandas/core/dtypes/base.py index e52b318c0b4f7..5b7dadac5d914 100644 --- a/pandas/core/dtypes/base.py +++ b/pandas/core/dtypes/base.py @@ -12,6 +12,7 @@ import numpy as np +from pandas._libs.hashtable import object_hash from pandas._typing import ( DtypeObj, 
type_t, @@ -128,7 +129,9 @@ def __eq__(self, other: Any) -> bool: return False def __hash__(self) -> int: - return hash(tuple(getattr(self, attr) for attr in self._metadata)) + # for python>=3.10, different nan objects have different hashes + # we need to avoid that and thus use a hash function with the old behavior + return object_hash(tuple(getattr(self, attr) for attr in self._metadata)) def __ne__(self, other: Any) -> bool: return not self.__eq__(other) From 5fd63669e930fd72225a4fb135c4dceadd936a95 Mon Sep 17 00:00:00 2001 From: Egor Dranischnikow Date: Mon, 21 Jun 2021 14:58:11 +0200 Subject: [PATCH 4/4] adding function declarations to hashtable.pyi --- pandas/_libs/hashtable.pyi | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pandas/_libs/hashtable.pyi b/pandas/_libs/hashtable.pyi index 5a1b98b190dbc..951703e04d5a3 100644 --- a/pandas/_libs/hashtable.pyi +++ b/pandas/_libs/hashtable.pyi @@ -228,3 +228,5 @@ def ismember( arr: np.ndarray, values: np.ndarray, ) -> np.ndarray: ... # np.ndarray[bool] +def object_hash(obj) -> int: ... +def objects_are_equal(a, b) -> bool: ...