Skip to content

Commit c4bf1ed

Browse files
committed
Taking the upper 32 bits of PyHash into account as well
1 parent d558bce commit c4bf1ed

File tree

2 files changed

+32
-5
lines changed

2 files changed

+32
-5
lines changed

asv_bench/benchmarks/hash_functions.py

+9
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,15 @@ def time_isin_outside(self, dtype, exponent):
2525
self.s.isin(self.values_outside)
2626

2727

28+
class UniqueForLargePyObjectInts:
    """Benchmark ``pd.unique`` on an object array of large Python ints.

    Every value is ``x << 32``, so the low 32 bits of each integer are zero
    and the values differ only in bit 32 and above.
    """

    def setup(self):
        """Build the object-dtype array of 5000 distinct ints used by the timing."""
        lst = [x << 32 for x in range(5000)]
        # dtype=object keeps the elements as Python ints rather than int64.
        self.arr = np.array(lst, dtype=np.object_)

    def time_unique(self):
        """Timed body: compute the unique values of the prepared array."""
        pd.unique(self.arr)
35+
36+
2837
class IsinWithRandomFloat:
2938
params = [
3039
[np.float64, np.object],

pandas/_libs/src/klib/khash_python.h

+23-5
Original file line numberDiff line numberDiff line change
@@ -178,11 +178,29 @@ int PANDAS_INLINE pyobject_cmp(PyObject* a, PyObject* b) {
178178
return result;
179179
}
180180

181-
// For PyObject_Hash holds:
182-
// hash(0.0) == 0 == hash(-0.0)
183-
// hash(X) == 0 if X is a NaN-value
184-
// so it is OK to use it directly
185-
#define kh_python_hash_func(key) (PyObject_Hash(key))
181+
182+
// Compute the khint32_t hash of a Python object from PyObject_Hash.
//
// For PyObject_Hash the following holds:
//     hash(0.0) == 0 == hash(-0.0)
//     hash(X) == 0 if X is a NaN value
// so its result can be used directly for doubles.
//
// If PyObject_Hash fails (returns -1), the pending Python exception is
// cleared and 0 is returned instead.
khint32_t PANDAS_INLINE kh_python_hash_func(PyObject* key){
    Py_hash_t hash = PyObject_Hash(key);
    if (hash == -1) {
        PyErr_Clear();
        return 0;
    }
#if SIZEOF_PY_HASH_T == 4
    // The hash is already a 32-bit value.
    return hash;
#else
    // On 64-bit builds the upper 32 bits carry information as well, so the
    // full value is passed through kh_int64_hash_func; see GH 37615.
    return kh_int64_hash_func(hash);
#endif
}
202+
203+
186204
// Expands to pyobject_cmp(a, b). Presumably the key-equality test paired with
// kh_python_hash_func -- confirm at the khash instantiation site.
#define kh_python_hash_equal(a, b) (pyobject_cmp(a, b))
187205

188206

0 commit comments

Comments
 (0)