Skip to content

BUG: nan-objects lookup fails with Python3.10 #41988

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 5 commits into from
Jun 17, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
101 changes: 97 additions & 4 deletions pandas/_libs/src/klib/khash_python.h
Original file line number Diff line number Diff line change
Expand Up @@ -251,12 +251,105 @@ int PANDAS_INLINE pyobject_cmp(PyObject* a, PyObject* b) {
}


khint32_t PANDAS_INLINE kh_python_hash_func(PyObject* key){
// Hash a C double so that every NaN value hashes to 0.
//
// Since Python 3.10 a NaN no longer has hash 0, so NaN is handled here
// explicitly; all other values are delegated to _Py_HashDouble, whose
// signature takes an extra leading argument from Python 3.10 on.
Py_hash_t PANDAS_INLINE _Pandas_HashDouble(double val) {
    if (!Py_IS_NAN(val)) {
#if PY_VERSION_HEX >= 0x030A0000
        return _Py_HashDouble(NULL, val);
#else
        return _Py_HashDouble(val);
#endif
    }
    return 0;
}


// Hash a float object by passing its double value to _Pandas_HashDouble.
Py_hash_t PANDAS_INLINE floatobject_hash(PyFloatObject* key) {
    const double value = PyFloat_AS_DOUBLE(key);
    return _Pandas_HashDouble(value);
}


// Hash a complex object from its real and imaginary parts, using
// _Pandas_HashDouble in place of _Py_HashDouble for each part.
//
// Returns -1 if either part hashes to -1; a combined value equal to -1 is
// mapped to -2 so that -1 is only ever returned through the first path.
Py_hash_t PANDAS_INLINE complexobject_hash(PyComplexObject* key) {
    const Py_uhash_t minus_one = (Py_uhash_t)-1;
    const Py_uhash_t re_hash = (Py_uhash_t)_Pandas_HashDouble(key->cval.real);
    const Py_uhash_t im_hash = (Py_uhash_t)_Pandas_HashDouble(key->cval.imag);
    Py_uhash_t mixed;

    if (re_hash == minus_one || im_hash == minus_one) {
        return -1;
    }

    mixed = re_hash + _PyHASH_IMAG * im_hash;
    return (mixed == minus_one) ? (Py_hash_t)-2 : (Py_hash_t)mixed;
}


// Forward declaration: tupleobject_hash hashes each tuple element through
// kh_python_hash_func, which is only defined after it in this file.
khint32_t PANDAS_INLINE kh_python_hash_func(PyObject* key);

// We could use any hashing algorithm; this is the original CPython's for tuples.
//
// The primes and the rotation width are chosen by the width of Py_uhash_t:
// the 64-bit set when SIZEOF_PY_UHASH_T > 4, the 32-bit set otherwise.
//
// The argument of _PandasHASH_XXROTATE is parenthesized in the expansion so
// that an expression argument (e.g. `a ^ b`) is rotated as a whole instead of
// being regrouped by operator precedence. The argument is evaluated twice, so
// it must not have side effects.

#if SIZEOF_PY_UHASH_T > 4
#define _PandasHASH_XXPRIME_1 ((Py_uhash_t)11400714785074694791ULL)
#define _PandasHASH_XXPRIME_2 ((Py_uhash_t)14029467366897019727ULL)
#define _PandasHASH_XXPRIME_5 ((Py_uhash_t)2870177450012600261ULL)
#define _PandasHASH_XXROTATE(x) (((x) << 31) | ((x) >> 33))  /* Rotate left 31 bits */
#else
#define _PandasHASH_XXPRIME_1 ((Py_uhash_t)2654435761UL)
#define _PandasHASH_XXPRIME_2 ((Py_uhash_t)2246822519UL)
#define _PandasHASH_XXPRIME_5 ((Py_uhash_t)374761393UL)
#define _PandasHASH_XXROTATE(x) (((x) << 13) | ((x) >> 19))  /* Rotate left 13 bits */
#endif

// Hash a tuple by folding the kh_python_hash_func value of every element
// into an accumulator, then mixing in the tuple length.
//
// Returns -1 as soon as an element's hash equals (Py_uhash_t)-1.
Py_hash_t PANDAS_INLINE tupleobject_hash(PyTupleObject* key) {
    Py_ssize_t n_items = Py_SIZE(key);
    PyObject **cursor = key->ob_item;
    PyObject **stop = cursor + n_items;
    Py_uhash_t state = _PandasHASH_XXPRIME_5;

    for (; cursor < stop; ++cursor) {
        // Elements go through kh_python_hash_func rather than a plain
        // per-object hash call.
        Py_uhash_t lane = kh_python_hash_func(*cursor);
        if (lane == (Py_uhash_t)-1) {
            return -1;
        }
        state += lane * _PandasHASH_XXPRIME_2;
        state = _PandasHASH_XXROTATE(state);
        state *= _PandasHASH_XXPRIME_1;
    }

    /* Add input length, mangled to keep the historical value of hash(()). */
    state += n_items ^ (_PandasHASH_XXPRIME_5 ^ 3527539UL);

    // Never hand back -1 for a successfully computed hash; substitute a
    // fixed replacement value instead.
    if (state == (Py_uhash_t)-1) {
        return 1546275796;
    }
    return (Py_hash_t)state;
}


khint32_t PANDAS_INLINE kh_python_hash_func(PyObject* key) {
Py_hash_t hash;
// For PyObject_Hash holds:
// hash(0.0) == 0 == hash(-0.0)
// hash(X) == 0 if X is a NaN-value
// so it is OK to use it directly for doubles
Py_hash_t hash = PyObject_Hash(key);
// yet for different nan-objects different hash-values
// are possible
if (PyFloat_CheckExact(key)) {
// we cannot use kh_float64_hash_func
// because float(k) == k holds for any int-object k
// and kh_float64_hash_func doesn't respect it
hash = floatobject_hash((PyFloatObject*)key);
}
else if (PyComplex_CheckExact(key)) {
// we cannot use kh_complex128_hash_func
// because complex(k,0) == k holds for any int-object k
// and kh_complex128_hash_func doesn't respect it
hash = complexobject_hash((PyComplexObject*)key);
}
else if (PyTuple_CheckExact(key)) {
hash = tupleobject_hash((PyTupleObject*)key);
}
else {
hash = PyObject_Hash(key);
}

if (hash == -1) {
PyErr_Clear();
return 0;
Expand Down
31 changes: 31 additions & 0 deletions pandas/tests/libs/test_hashtable.py
Original file line number Diff line number Diff line change
Expand Up @@ -339,6 +339,29 @@ def test_unique(self, table_type, dtype):
assert np.all(np.isnan(unique)) and len(unique) == 1


def test_unique_for_nan_objects_floats():
    """Fifty separately created NaN floats yield a single unique key."""
    nan_floats = [float("nan") for _ in range(50)]
    keys = np.array(nan_floats, dtype=np.object_)
    table = ht.PyObjectHashTable()
    unique = table.unique(keys)
    assert len(unique) == 1


def test_unique_for_nan_objects_complex():
    """Fifty complex objects with a NaN real part yield a single unique key."""
    nan_complexes = [complex(float("nan"), 1.0) for _ in range(50)]
    keys = np.array(nan_complexes, dtype=np.object_)
    table = ht.PyObjectHashTable()
    unique = table.unique(keys)
    assert len(unique) == 1


def test_unique_for_nan_objects_tuple():
    """Nested tuples containing NaN collapse to one key next to the int 1."""
    nan_tuples = [(1.0, (float("nan"), 1.0)) for _ in range(50)]
    keys = np.array([1, *nan_tuples], dtype=np.object_)
    table = ht.PyObjectHashTable()
    unique = table.unique(keys)
    # One entry for the leading integer, one for all the equal tuples.
    assert len(unique) == 2


def get_ht_function(fun_name, type_suffix):
    """Return the attribute of ``ht`` named ``fun_name``.

    ``type_suffix`` is accepted but not used in the lookup.
    """
    ht_function = getattr(ht, fun_name)
    return ht_function

Expand Down Expand Up @@ -497,3 +520,11 @@ def test_ismember_tuple_with_nans():
result = isin(values, comps)
expected = np.array([True, False], dtype=np.bool_)
tm.assert_numpy_array_equal(result, expected)


def test_float_complex_int_are_equal_as_objects():
    """``isin`` matches 5, 5.0 and 5.0+0j against ints, but not the string."""
    candidates = ["a", 5, 5.0, 5.0 + 0j]
    pool = list(range(129))
    expected = np.array([False, True, True, True], dtype=np.bool_)
    tm.assert_numpy_array_equal(isin(candidates, pool), expected)