Skip to content

Commit 41e971e

Browse files
realead authored and JulianWgs committed
BUG: nan-objects lookup fails with Python3.10 (pandas-dev#41988)
1 parent 1832373 commit 41e971e

File tree

2 files changed

+128
-4
lines changed

2 files changed

+128
-4
lines changed

pandas/_libs/src/klib/khash_python.h

+97-4
Original file line numberDiff line numberDiff line change
@@ -251,12 +251,105 @@ int PANDAS_INLINE pyobject_cmp(PyObject* a, PyObject* b) {
251251
}
252252

253253

254-
khint32_t PANDAS_INLINE kh_python_hash_func(PyObject* key){
254+
Py_hash_t PANDAS_INLINE _Pandas_HashDouble(double val) {
255+
// Since Python 3.10, nan no longer has hash 0
256+
if (Py_IS_NAN(val)) {
257+
return 0;
258+
}
259+
#if PY_VERSION_HEX < 0x030A0000
260+
return _Py_HashDouble(val);
261+
#else
262+
return _Py_HashDouble(NULL, val);
263+
#endif
264+
}
265+
266+
267+
Py_hash_t PANDAS_INLINE floatobject_hash(PyFloatObject* key) {
268+
return _Pandas_HashDouble(PyFloat_AS_DOUBLE(key));
269+
}
270+
271+
272+
// replaces _Py_HashDouble with _Pandas_HashDouble
273+
Py_hash_t PANDAS_INLINE complexobject_hash(PyComplexObject* key) {
274+
Py_uhash_t realhash = (Py_uhash_t)_Pandas_HashDouble(key->cval.real);
275+
Py_uhash_t imaghash = (Py_uhash_t)_Pandas_HashDouble(key->cval.imag);
276+
if (realhash == (Py_uhash_t)-1 || imaghash == (Py_uhash_t)-1) {
277+
return -1;
278+
}
279+
Py_uhash_t combined = realhash + _PyHASH_IMAG * imaghash;
280+
if (combined == (Py_uhash_t)-1) {
281+
return -2;
282+
}
283+
return (Py_hash_t)combined;
284+
}
285+
286+
287+
khint32_t PANDAS_INLINE kh_python_hash_func(PyObject* key);
288+
289+
// we could use any hashing algorithm; this is CPython's original algorithm for tuples
290+
291+
#if SIZEOF_PY_UHASH_T > 4
292+
#define _PandasHASH_XXPRIME_1 ((Py_uhash_t)11400714785074694791ULL)
293+
#define _PandasHASH_XXPRIME_2 ((Py_uhash_t)14029467366897019727ULL)
294+
#define _PandasHASH_XXPRIME_5 ((Py_uhash_t)2870177450012600261ULL)
295+
#define _PandasHASH_XXROTATE(x) ((x << 31) | (x >> 33)) /* Rotate left 31 bits */
296+
#else
297+
#define _PandasHASH_XXPRIME_1 ((Py_uhash_t)2654435761UL)
298+
#define _PandasHASH_XXPRIME_2 ((Py_uhash_t)2246822519UL)
299+
#define _PandasHASH_XXPRIME_5 ((Py_uhash_t)374761393UL)
300+
#define _PandasHASH_XXROTATE(x) ((x << 13) | (x >> 19)) /* Rotate left 13 bits */
301+
#endif
302+
303+
Py_hash_t PANDAS_INLINE tupleobject_hash(PyTupleObject* key) {
304+
Py_ssize_t i, len = Py_SIZE(key);
305+
PyObject **item = key->ob_item;
306+
307+
Py_uhash_t acc = _PandasHASH_XXPRIME_5;
308+
for (i = 0; i < len; i++) {
309+
Py_uhash_t lane = kh_python_hash_func(item[i]);
310+
if (lane == (Py_uhash_t)-1) {
311+
return -1;
312+
}
313+
acc += lane * _PandasHASH_XXPRIME_2;
314+
acc = _PandasHASH_XXROTATE(acc);
315+
acc *= _PandasHASH_XXPRIME_1;
316+
}
317+
318+
/* Add input length, mangled to keep the historical value of hash(()). */
319+
acc += len ^ (_PandasHASH_XXPRIME_5 ^ 3527539UL);
320+
321+
if (acc == (Py_uhash_t)-1) {
322+
return 1546275796;
323+
}
324+
return acc;
325+
}
326+
327+
328+
khint32_t PANDAS_INLINE kh_python_hash_func(PyObject* key) {
329+
Py_hash_t hash;
255330
// For PyObject_Hash holds:
256331
// hash(0.0) == 0 == hash(-0.0)
257-
// hash(X) == 0 if X is a NaN-value
258-
// so it is OK to use it directly for doubles
259-
Py_hash_t hash = PyObject_Hash(key);
332+
// yet for different nan-objects different hash-values
333+
// are possible
334+
if (PyFloat_CheckExact(key)) {
335+
// we cannot use kh_float64_hash_func
336+
// because float(k) == k holds for any int-object k
337+
// and kh_float64_hash_func doesn't respect it
338+
hash = floatobject_hash((PyFloatObject*)key);
339+
}
340+
else if (PyComplex_CheckExact(key)) {
341+
// we cannot use kh_complex128_hash_func
342+
// because complex(k,0) == k holds for any int-object k
343+
// and kh_complex128_hash_func doesn't respect it
344+
hash = complexobject_hash((PyComplexObject*)key);
345+
}
346+
else if (PyTuple_CheckExact(key)) {
347+
hash = tupleobject_hash((PyTupleObject*)key);
348+
}
349+
else {
350+
hash = PyObject_Hash(key);
351+
}
352+
260353
if (hash == -1) {
261354
PyErr_Clear();
262355
return 0;

pandas/tests/libs/test_hashtable.py

+31
Original file line numberDiff line numberDiff line change
@@ -339,6 +339,29 @@ def test_unique(self, table_type, dtype):
339339
assert np.all(np.isnan(unique)) and len(unique) == 1
340340

341341

342+
def test_unique_for_nan_objects_floats():
343+
table = ht.PyObjectHashTable()
344+
keys = np.array([float("nan") for i in range(50)], dtype=np.object_)
345+
unique = table.unique(keys)
346+
assert len(unique) == 1
347+
348+
349+
def test_unique_for_nan_objects_complex():
350+
table = ht.PyObjectHashTable()
351+
keys = np.array([complex(float("nan"), 1.0) for i in range(50)], dtype=np.object_)
352+
unique = table.unique(keys)
353+
assert len(unique) == 1
354+
355+
356+
def test_unique_for_nan_objects_tuple():
357+
table = ht.PyObjectHashTable()
358+
keys = np.array(
359+
[1] + [(1.0, (float("nan"), 1.0)) for i in range(50)], dtype=np.object_
360+
)
361+
unique = table.unique(keys)
362+
assert len(unique) == 2
363+
364+
342365
def get_ht_function(fun_name, type_suffix):
343366
return getattr(ht, fun_name)
344367

@@ -497,3 +520,11 @@ def test_ismember_tuple_with_nans():
497520
result = isin(values, comps)
498521
expected = np.array([True, False], dtype=np.bool_)
499522
tm.assert_numpy_array_equal(result, expected)
523+
524+
525+
def test_float_complex_int_are_equal_as_objects():
526+
values = ["a", 5, 5.0, 5.0 + 0j]
527+
comps = list(range(129))
528+
result = isin(values, comps)
529+
expected = np.array([False, True, True, True], dtype=np.bool_)
530+
tm.assert_numpy_array_equal(result, expected)

0 commit comments

Comments
 (0)