Skip to content

Commit e455fed

Browse files
committed
hash function for tuples
1 parent fdbfdec commit e455fed

File tree

1 file changed

+49
-5
lines changed

1 file changed

+49
-5
lines changed

pandas/_libs/src/klib/khash_python.h

+49-5
Original file line numberDiff line numberDiff line change
@@ -251,7 +251,7 @@ int PANDAS_INLINE pyobject_cmp(PyObject* a, PyObject* b) {
251251
}
252252

253253

254-
Py_hash_t PANDAS_INLINE _Pandas_HashDouble(double val){
254+
Py_hash_t PANDAS_INLINE _Pandas_HashDouble(double val) {
255255
// Since Python 3.10, nan no longer has hash 0
256256
if (Py_IS_NAN(val)) {
257257
return 0;
@@ -264,13 +264,13 @@ Py_hash_t PANDAS_INLINE _Pandas_HashDouble(double val){
264264
}
265265

266266

267-
Py_hash_t PANDAS_INLINE floatobject_hash(PyFloatObject* key){
267+
/* Hash a Python float object: read its C double value with
 * PyFloat_AS_DOUBLE and hash that value with _Pandas_HashDouble. */
Py_hash_t PANDAS_INLINE floatobject_hash(PyFloatObject* key) {
268268
return _Pandas_HashDouble(PyFloat_AS_DOUBLE(key));
269269
}
270270

271271

272272
// replaces _Py_HashDouble with _Pandas_HashDouble
273-
Py_hash_t PANDAS_INLINE complexobject_hash(PyComplexObject* key){
273+
Py_hash_t PANDAS_INLINE complexobject_hash(PyComplexObject* key) {
274274
Py_uhash_t realhash = (Py_uhash_t)_Pandas_HashDouble(key->cval.real);
275275
Py_uhash_t imaghash = (Py_uhash_t)_Pandas_HashDouble(key->cval.imag);
276276
if (realhash == (Py_uhash_t)-1 || imaghash == (Py_uhash_t)-1) {
@@ -284,11 +284,52 @@ Py_hash_t PANDAS_INLINE complexobject_hash(PyComplexObject* key){
284284
}
285285

286286

287-
khint32_t PANDAS_INLINE kh_python_hash_func(PyObject* key){
287+
/* Forward declaration: tupleobject_hash() below hashes each element with
 * kh_python_hash_func(), and kh_python_hash_func() in turn dispatches exact
 * tuples to tupleobject_hash(), so one of the two must be declared first. */
khint32_t PANDAS_INLINE kh_python_hash_func(PyObject* key);
288+
289+
// we could use any hashing algorithm; this is the original CPython algorithm for tuples
290+
291+
/* Constants and rotation used by tupleobject_hash().
 * Two variants are provided, selected by the size of Py_uhash_t:
 * SIZEOF_PY_UHASH_T > 4 picks the 64-bit constants (rotate by 31),
 * otherwise the 32-bit constants (rotate by 13) are used.
 * Presumably these are the xxHash-derived primes from CPython's tuple
 * hash -- TODO confirm against CPython's Objects/tupleobject.c.
 *
 * NOTE(review): _PandasHASH_XXROTATE does not parenthesize its argument, so
 * it is only safe when passed a simple expression such as a plain variable
 * (as the visible call site does). The shift pairs (31 + 33 = 64 and
 * 13 + 19 = 32) form a true rotation only when the operand is exactly
 * 64 / 32 bits wide. */
#if SIZEOF_PY_UHASH_T > 4
292+
#define _PandasHASH_XXPRIME_1 ((Py_uhash_t)11400714785074694791ULL)
293+
#define _PandasHASH_XXPRIME_2 ((Py_uhash_t)14029467366897019727ULL)
294+
#define _PandasHASH_XXPRIME_5 ((Py_uhash_t)2870177450012600261ULL)
295+
#define _PandasHASH_XXROTATE(x) ((x << 31) | (x >> 33)) /* Rotate left 31 bits */
296+
#else
297+
#define _PandasHASH_XXPRIME_1 ((Py_uhash_t)2654435761UL)
298+
#define _PandasHASH_XXPRIME_2 ((Py_uhash_t)2246822519UL)
299+
#define _PandasHASH_XXPRIME_5 ((Py_uhash_t)374761393UL)
300+
#define _PandasHASH_XXROTATE(x) ((x << 13) | (x >> 19)) /* Rotate left 13 bits */
301+
#endif
302+
303+
/* Hash a tuple by folding the hashes of its items into one accumulator.
 *
 * Each item is hashed with kh_python_hash_func() (not PyObject_Hash), so
 * items that are themselves exact tuples, floats or complex numbers go
 * through the pandas-specific hash functions as well.
 *
 * Returns -1 if an item hash equals (Py_uhash_t)-1; otherwise returns the
 * accumulated value, with a result of (Py_uhash_t)-1 replaced by the fixed
 * value 1546275796. */
Py_hash_t PANDAS_INLINE tupleobject_hash(PyTupleObject* key) {
304+
Py_ssize_t i, len = Py_SIZE(key);
305+
/* Direct access to the tuple's item array. */
PyObject **item = key->ob_item;
306+
307+
/* The accumulator starts from a fixed seed. */
Py_uhash_t acc = _PandasHASH_XXPRIME_5;
308+
for (i = 0; i < len; i++) {
309+
/* kh_python_hash_func is declared as returning khint32_t; the value is
 * converted to Py_uhash_t here. */
Py_uhash_t lane = kh_python_hash_func(item[i]);
310+
/* NOTE(review): if khint32_t is a 32-bit unsigned type and Py_uhash_t is
 * 64 bits wide, this comparison can never be true -- confirm whether the
 * error path is reachable. */
if (lane == (Py_uhash_t)-1) {
311+
return -1;
312+
}
313+
/* Mix the item hash in: multiply-add, rotate, multiply. */
acc += lane * _PandasHASH_XXPRIME_2;
314+
acc = _PandasHASH_XXROTATE(acc);
315+
acc *= _PandasHASH_XXPRIME_1;
316+
}
317+
318+
/* Add input length, mangled to keep the historical value of hash(()). */
319+
acc += len ^ (_PandasHASH_XXPRIME_5 ^ 3527539UL);
320+
321+
/* Never return -1 as a valid hash: the loop above uses -1 as its failure
 * return value, so that result is replaced by a fixed constant. */
if (acc == (Py_uhash_t)-1) {
322+
return 1546275796;
323+
}
324+
return acc;
325+
}
326+
327+
328+
khint32_t PANDAS_INLINE kh_python_hash_func(PyObject* key) {
288329
Py_hash_t hash;
289330
// For PyObject_Hash holds:
290331
// hash(0.0) == 0 == hash(-0.0)
291-
// yet for different nan-object different hash-values
332+
// yet for different nan-objects different hash-values
292333
// are possible
293334
if (PyFloat_CheckExact(key)) {
294335
// we cannot use kh_float64_hash_func
@@ -302,6 +343,9 @@ khint32_t PANDAS_INLINE kh_python_hash_func(PyObject* key){
302343
// and kh_complex128_hash_func doesn't respect it
303344
hash = complexobject_hash((PyComplexObject*)key);
304345
}
346+
else if (PyTuple_CheckExact(key)) {
347+
hash = tupleobject_hash((PyTupleObject*)key);
348+
}
305349
else {
306350
hash = PyObject_Hash(key);
307351
}

0 commit comments

Comments
 (0)