class UniqueForLargePyObjectInts:
    """
    Benchmark ``pd.unique`` on an object-dtype array of large Python ints.

    Every element is a multiple of ``2**32`` (built as ``x << 32``), so the
    low 32 bits of each value are zero.  Tracks the performance improvement
    in ``unique`` for object data type, see GH 37615.
    """

    def setup(self):
        # 5000 distinct Python ints: 0, 1 << 32, 2 << 32, ...
        n_values = 5000
        shift = 32
        big_ints = [i << shift for i in range(n_values)]
        # dtype=object keeps the elements as Python int objects
        self.arr = np.array(big_ints, dtype=np.object_)

    def time_unique(self):
        # Timed statement; the result is intentionally discarded.
        pd.unique(self.arr)
--------------------------------------------------------------------------- diff --git a/pandas/_libs/src/klib/khash_python.h b/pandas/_libs/src/klib/khash_python.h index 0073aaf0195c7..aee018262e3a6 100644 --- a/pandas/_libs/src/klib/khash_python.h +++ b/pandas/_libs/src/klib/khash_python.h @@ -178,11 +178,31 @@ int PANDAS_INLINE pyobject_cmp(PyObject* a, PyObject* b) { return result; } -// For PyObject_Hash holds: -// hash(0.0) == 0 == hash(-0.0) -// hash(X) == 0 if X is a NaN-value -// so it is OK to use it directly -#define kh_python_hash_func(key) (PyObject_Hash(key)) + +khint32_t PANDAS_INLINE kh_python_hash_func(PyObject* key){ + // For PyObject_Hash holds: + // hash(0.0) == 0 == hash(-0.0) + // hash(X) == 0 if X is a NaN-value + // so it is OK to use it directly for doubles + Py_hash_t hash = PyObject_Hash(key); + if (hash == -1) { + PyErr_Clear(); + return 0; + } + #if SIZEOF_PY_HASH_T == 4 + // it is already 32bit value + return hash; + #else + // for 64bit builds, + // we need information of the upper 32bits as well + // see GH 37615 + khuint64_t as_uint = (khuint64_t) hash; + // uints avoid undefined behavior of signed ints + return (as_uint>>32)^as_uint; + #endif +} + + #define kh_python_hash_equal(a, b) (pyobject_cmp(a, b))