From 0dee8984000f506f94379b647fa1bf17bc983cf4 Mon Sep 17 00:00:00 2001 From: realead Date: Thu, 17 Jun 2021 21:05:35 +0200 Subject: [PATCH] Backport PR #41988: BUG: nan-objects lookup fails with Python3.10 --- pandas/_libs/src/klib/khash_python.h | 101 +++++++++++++++++++++++++-- pandas/tests/libs/test_hashtable.py | 31 ++++++++ 2 files changed, 128 insertions(+), 4 deletions(-) diff --git a/pandas/_libs/src/klib/khash_python.h b/pandas/_libs/src/klib/khash_python.h index 87c6283c19a2f..c8e1ca5ebb4d3 100644 --- a/pandas/_libs/src/klib/khash_python.h +++ b/pandas/_libs/src/klib/khash_python.h @@ -251,12 +251,105 @@ int PANDAS_INLINE pyobject_cmp(PyObject* a, PyObject* b) { } -khint32_t PANDAS_INLINE kh_python_hash_func(PyObject* key){ +Py_hash_t PANDAS_INLINE _Pandas_HashDouble(double val) { + //Since Python3.10, nan no longer has hash 0 + if (Py_IS_NAN(val)) { + return 0; + } +#if PY_VERSION_HEX < 0x030A0000 + return _Py_HashDouble(val); +#else + return _Py_HashDouble(NULL, val); +#endif +} + + +Py_hash_t PANDAS_INLINE floatobject_hash(PyFloatObject* key) { + return _Pandas_HashDouble(PyFloat_AS_DOUBLE(key)); +} + + +// replaces _Py_HashDouble with _Pandas_HashDouble +Py_hash_t PANDAS_INLINE complexobject_hash(PyComplexObject* key) { + Py_uhash_t realhash = (Py_uhash_t)_Pandas_HashDouble(key->cval.real); + Py_uhash_t imaghash = (Py_uhash_t)_Pandas_HashDouble(key->cval.imag); + if (realhash == (Py_uhash_t)-1 || imaghash == (Py_uhash_t)-1) { + return -1; + } + Py_uhash_t combined = realhash + _PyHASH_IMAG * imaghash; + if (combined == (Py_uhash_t)-1) { + return -2; + } + return (Py_hash_t)combined; +} + + +khint32_t PANDAS_INLINE kh_python_hash_func(PyObject* key); + +//we could use any hashing algorithm, this is the original CPython's for tuples + +#if SIZEOF_PY_UHASH_T > 4 +#define _PandasHASH_XXPRIME_1 ((Py_uhash_t)11400714785074694791ULL) +#define _PandasHASH_XXPRIME_2 ((Py_uhash_t)14029467366897019727ULL) +#define _PandasHASH_XXPRIME_5 
((Py_uhash_t)2870177450012600261ULL) +#define _PandasHASH_XXROTATE(x) ((x << 31) | (x >> 33)) /* Rotate left 31 bits */ +#else +#define _PandasHASH_XXPRIME_1 ((Py_uhash_t)2654435761UL) +#define _PandasHASH_XXPRIME_2 ((Py_uhash_t)2246822519UL) +#define _PandasHASH_XXPRIME_5 ((Py_uhash_t)374761393UL) +#define _PandasHASH_XXROTATE(x) ((x << 13) | (x >> 19)) /* Rotate left 13 bits */ +#endif + +Py_hash_t PANDAS_INLINE tupleobject_hash(PyTupleObject* key) { + Py_ssize_t i, len = Py_SIZE(key); + PyObject **item = key->ob_item; + + Py_uhash_t acc = _PandasHASH_XXPRIME_5; + for (i = 0; i < len; i++) { + Py_uhash_t lane = kh_python_hash_func(item[i]); + if (lane == (Py_uhash_t)-1) { + return -1; + } + acc += lane * _PandasHASH_XXPRIME_2; + acc = _PandasHASH_XXROTATE(acc); + acc *= _PandasHASH_XXPRIME_1; + } + + /* Add input length, mangled to keep the historical value of hash(()). */ + acc += len ^ (_PandasHASH_XXPRIME_5 ^ 3527539UL); + + if (acc == (Py_uhash_t)-1) { + return 1546275796; + } + return acc; +} + + +khint32_t PANDAS_INLINE kh_python_hash_func(PyObject* key) { + Py_hash_t hash; // For PyObject_Hash holds: // hash(0.0) == 0 == hash(-0.0) - // hash(X) == 0 if X is a NaN-value - // so it is OK to use it directly for doubles - Py_hash_t hash = PyObject_Hash(key); + // yet for different nan-objects different hash-values + // are possible + if (PyFloat_CheckExact(key)) { + // we cannot use kh_float64_hash_func + // because float(k) == k holds for any int-object k + // and kh_float64_hash_func doesn't respect it + hash = floatobject_hash((PyFloatObject*)key); + } + else if (PyComplex_CheckExact(key)) { + // we cannot use kh_complex128_hash_func + // because complex(k,0) == k holds for any int-object k + // and kh_complex128_hash_func doesn't respect it + hash = complexobject_hash((PyComplexObject*)key); + } + else if (PyTuple_CheckExact(key)) { + hash = tupleobject_hash((PyTupleObject*)key); + } + else { + hash = PyObject_Hash(key); + } + if (hash == -1) { 
PyErr_Clear(); return 0; diff --git a/pandas/tests/libs/test_hashtable.py b/pandas/tests/libs/test_hashtable.py index 0edcebdc069f4..a1a43fa6ef300 100644 --- a/pandas/tests/libs/test_hashtable.py +++ b/pandas/tests/libs/test_hashtable.py @@ -339,6 +339,29 @@ def test_unique(self, table_type, dtype): assert np.all(np.isnan(unique)) and len(unique) == 1 +def test_unique_for_nan_objects_floats(): + table = ht.PyObjectHashTable() + keys = np.array([float("nan") for i in range(50)], dtype=np.object_) + unique = table.unique(keys) + assert len(unique) == 1 + + +def test_unique_for_nan_objects_complex(): + table = ht.PyObjectHashTable() + keys = np.array([complex(float("nan"), 1.0) for i in range(50)], dtype=np.object_) + unique = table.unique(keys) + assert len(unique) == 1 + + +def test_unique_for_nan_objects_tuple(): + table = ht.PyObjectHashTable() + keys = np.array( + [1] + [(1.0, (float("nan"), 1.0)) for i in range(50)], dtype=np.object_ + ) + unique = table.unique(keys) + assert len(unique) == 2 + + def get_ht_function(fun_name, type_suffix): return getattr(ht, fun_name) @@ -497,3 +520,11 @@ def test_ismember_tuple_with_nans(): result = isin(values, comps) expected = np.array([True, False], dtype=np.bool_) tm.assert_numpy_array_equal(result, expected) + + +def test_float_complex_int_are_equal_as_objects(): + values = ["a", 5, 5.0, 5.0 + 0j] + comps = list(range(129)) + result = isin(values, comps) + expected = np.array([False, True, True, True], dtype=np.bool_) + tm.assert_numpy_array_equal(result, expected)