From 498a4c50d4c8540507baeb275406ca8b91c30b54 Mon Sep 17 00:00:00 2001 From: Egor Dranischnikow Date: Tue, 13 Oct 2020 21:08:56 +0200 Subject: [PATCH 1/8] adding asv-benchmark hash_functions --- asv_bench/benchmarks/hash_functions.py | 164 +++++++++++++++++++++++++ 1 file changed, 164 insertions(+) create mode 100644 asv_bench/benchmarks/hash_functions.py diff --git a/asv_bench/benchmarks/hash_functions.py b/asv_bench/benchmarks/hash_functions.py new file mode 100644 index 0000000000000..17bf434acf38a --- /dev/null +++ b/asv_bench/benchmarks/hash_functions.py @@ -0,0 +1,164 @@ +import numpy as np + +import pandas as pd + + +class IsinAlmostFullWithRandomInt: + params = [ + [np.float64, np.int64, np.uint64, np.object], + range(10, 21), + ] + param_names = ["dtype", "exponent"] + + def setup(self, dtype, exponent): + M = 3 * 2 ** (exponent - 2) + # 0.77-the maximal share of occupied buckets + np.random.seed(42) + self.s = pd.Series(np.random.randint(0, M, M)).astype(dtype) + self.values = np.random.randint(0, M, M).astype(dtype) + self.values_outside = self.values + M + + def time_isin(self, dtype, exponent): + self.s.isin(self.values) + + def time_isin_outside(self, dtype, exponent): + self.s.isin(self.values_outside) + + +class IsinWithRandomFloat: + params = [ + [np.float64, np.object], + [ + 1_300, + 2_000, + 7_000, + 8_000, + 70_000, + 80_000, + 750_000, + 900_000, + ], + ] + param_names = ["dtype", "M"] + + def setup(self, dtype, M): + np.random.seed(42) + self.values = np.random.rand(M) + self.s = pd.Series(self.values).astype(dtype) + np.random.shuffle(self.values) + self.values_outside = self.values + 0.1 + + def time_isin(self, dtype, M): + self.s.isin(self.values) + + def time_isin_outside(self, dtype, M): + self.s.isin(self.values_outside) + + +class IsinWithArangeSorted: + params = [ + [np.float64, np.int64, np.uint64, np.object], + [ + 1_000, + 2_000, + 8_000, + 100_000, + 1_000_000, + ], + ] + param_names = ["dtype", "M"] + + def setup(self, dtype, M): 
+ self.s = pd.Series(np.arange(M)).astype(dtype) + self.values = np.arange(M).astype(dtype) + + def time_isin(self, dtype, M): + self.s.isin(self.values) + + +class IsinWithArange: + params = [ + [np.float64, np.int64, np.uint64, np.object], + [ + 1_000, + 2_000, + 8_000, + ], + [-2, 0, 2], + ] + param_names = ["dtype", "M", "offset_factor"] + + def setup(self, dtype, M, offset_factor): + offset = int(M * offset_factor) + np.random.seed(42) + tmp = pd.Series(np.random.randint(offset, M + offset, 10 ** 6)) + self.s = tmp.astype(dtype) + self.values = np.arange(M).astype(dtype) + + def time_isin(self, dtype, M, offset_factor): + self.s.isin(self.values) + + +class Float64GroupIndex: + # GH28303 + def setup(self): + self.df = pd.date_range( + start="1/1/2018", end="1/2/2018", periods=1e6 + ).to_frame() + self.group_index = np.round(self.df.index.astype(int) / 1e9) + + def time_groupby(self): + self.df.groupby(self.group_index).last() + + +class UniqueAndFactorizeArange: + params = range(4, 16) + param_names = ["exponent"] + + def setup(self, exponent): + a = np.arange(10 ** 4, dtype="float64") + self.a2 = (a + 10 ** exponent).repeat(100) + + def time_factorize(self, exponent): + pd.factorize(self.a2) + + def time_unique(self, exponent): + pd.unique(self.a2) + + +class NumericSeriesIndexing: + + params = [ + (pd.Int64Index, pd.UInt64Index, pd.Float64Index), + (10 ** 4, 10 ** 5, 5 * 10 ** 5, 10 ** 6, 5 * 10 ** 6), + ] + param_names = ["index_dtype", "N"] + + def setup(self, index, N): + vals = np.array(list(range(55)) + [54] + list(range(55, N - 1))) + indices = index(vals) + self.data = pd.Series(np.arange(N), index=indices) + + def time_loc_slice(self, index, N): + # trigger building of mapping + self.data.loc[:800] + + +class NumericSeriesIndexingShuffled: + + params = [ + (pd.Int64Index, pd.UInt64Index, pd.Float64Index), + (10 ** 4, 10 ** 5, 5 * 10 ** 5, 10 ** 6, 5 * 10 ** 6), + ] + param_names = ["index_dtype", "N"] + + def setup(self, index, N): + vals = 
np.array(list(range(55)) + [54] + list(range(55, N - 1))) + np.random.seed(42) + np.random.shuffle(vals) + indices = index(vals) + self.data = pd.Series(np.arange(N), index=indices) + + def time_loc_slice(self, index, N): + # trigger building of mapping + self.data.loc[:800] From 937c9ef69c62b2e6aaa13f2b44e168a4672f50da Mon Sep 17 00:00:00 2001 From: Egor Dranischnikow Date: Mon, 5 Oct 2020 00:07:14 +0200 Subject: [PATCH 2/8] using murmur2 for probing step --- pandas/_libs/src/klib/khash.h | 30 +++++++++++++++++++++++++++++- 1 file changed, 29 insertions(+), 1 deletion(-) diff --git a/pandas/_libs/src/klib/khash.h b/pandas/_libs/src/klib/khash.h index 916838d1e9584..5ac7552e367b1 100644 --- a/pandas/_libs/src/klib/khash.h +++ b/pandas/_libs/src/klib/khash.h @@ -143,10 +143,38 @@ typedef khint_t khiter_t; #define __ac_set_isboth_false(flag, i) __ac_set_isempty_false(flag, i) #define __ac_set_isdel_true(flag, i) ((void)0) + +// specializations of https://github.com/aappleby/smhasher/blob/master/src/MurmurHash2.cpp +khint32_t PANDAS_INLINE murmur2_32to32(khint32_t k){ + const khint32_t SEED = 0xc70f6907UL; + // 'm' and 'r' are mixing constants generated offline. + // They're not really 'magic', they just happen to work well. + const khint32_t M_32 = 0x5bd1e995; + const int R_32 = 24; + + // Initialize the hash to a 'random' value + khint32_t h = SEED ^ 4; + + //handle 4 bytes: + k *= M_32; + k ^= k >> R_32; + k *= M_32; + + h *= M_32; + h ^= k; + + // Do a few final mixes of the hash to ensure the "last few + // bytes" are well-incorporated. (Really needed here?) + h ^= h >> 13; + h *= M_32; + h ^= h >> 15; + return h; +} + #ifdef KHASH_LINEAR #define __ac_inc(k, m) 1 #else -#define __ac_inc(k, m) (((k)>>3 ^ (k)<<3) | 1) & (m) +#define __ac_inc(k, m) (murmur2_32to32(k) | 1) & (m) #endif #define __ac_fsize(m) ((m) < 32? 
1 : (m)>>5) From 3c32be61dc388445676b6704fc1067ad2fd68fc7 Mon Sep 17 00:00:00 2001 From: Egor Dranischnikow Date: Tue, 13 Oct 2020 21:30:23 +0200 Subject: [PATCH 3/8] [PERF] using murmur2 hash for float64 klib-hash-tables --- pandas/_libs/src/klib/khash.h | 48 ++++++++++++++++++++++++++++ pandas/_libs/src/klib/khash_python.h | 36 ++++++++++++--------- 2 files changed, 69 insertions(+), 15 deletions(-) diff --git a/pandas/_libs/src/klib/khash.h b/pandas/_libs/src/klib/khash.h index 5ac7552e367b1..61a4e80ea8cbc 100644 --- a/pandas/_libs/src/klib/khash.h +++ b/pandas/_libs/src/klib/khash.h @@ -171,6 +171,54 @@ khint32_t PANDAS_INLINE murmur2_32to32(khint32_t k){ return h; } +// it is possible to have a special x64-version, which would need fewer operations, but +// always using the 32bit version also has some benefits: +// - one code for 32bit and 64bit builds +// - the same case for 32bit and 64bit builds +// - no performance difference could be measured compared to a possible x64-version + +khint32_t PANDAS_INLINE murmur2_32_32to32(khint32_t k1, khint32_t k2){ + const khint32_t SEED = 0xc70f6907UL; + // 'm' and 'r' are mixing constants generated offline. + // They're not really 'magic', they just happen to work well. + const khint32_t M_32 = 0x5bd1e995; + const int R_32 = 24; + + // Initialize the hash to a 'random' value + khint32_t h = SEED ^ 4; + + //handle first 4 bytes: + k1 *= M_32; + k1 ^= k1 >> R_32; + k1 *= M_32; + + h *= M_32; + h ^= k1; + + //handle second 4 bytes: + k2 *= M_32; + k2 ^= k2 >> R_32; + k2 *= M_32; + + h *= M_32; + h ^= k2; + + // Do a few final mixes of the hash to ensure the "last few + // bytes" are well-incorporated.
+ h ^= h >> 13; + h *= M_32; + h ^= h >> 15; + return h; +} + +khint32_t PANDAS_INLINE murmur2_64to32(khint64_t k){ + khint32_t k1 = (khint32_t)k; + khint32_t k2 = (khint32_t)(k >> 32); + + return murmur2_32_32to32(k1, k2); +} + + #ifdef KHASH_LINEAR #define __ac_inc(k, m) 1 #else diff --git a/pandas/_libs/src/klib/khash_python.h b/pandas/_libs/src/klib/khash_python.h index 2b46d30c3adb6..e50b09bd00d74 100644 --- a/pandas/_libs/src/klib/khash_python.h +++ b/pandas/_libs/src/klib/khash_python.h @@ -13,25 +13,31 @@ // is 64 bits the truncation causes collission issues. Given all that, we use our own // simple hash, viewing the double bytes as an int64 and using khash's default // hash for 64 bit integers. -// GH 13436 +// GH 13436 showed that _Py_HashDouble doesn't work well with khash +// GH 28303 showed, that the simple xoring-version isn't good enough +// thus murmur2-hash is used + khint64_t PANDAS_INLINE asint64(double key) { - khint64_t val; - memcpy(&val, &key, sizeof(double)); - return val; + khint64_t val; + memcpy(&val, &key, sizeof(double)); + return val; } -// correct for all inputs but not -0.0 and NaNs -#define kh_float64_hash_func_0_NAN(key) (khint32_t)((asint64(key))>>33^(asint64(key))^(asint64(key))<<11) - -// correct for all inputs but not NaNs -#define kh_float64_hash_func_NAN(key) ((key) == 0.0 ? \ - kh_float64_hash_func_0_NAN(0.0) : \ - kh_float64_hash_func_0_NAN(key)) +#define ZERO_HASH 0 +#define NAN_HASH 0 -// correct for all -#define kh_float64_hash_func(key) ((key) != (key) ? 
\ - kh_float64_hash_func_NAN(Py_NAN) : \ - kh_float64_hash_func_NAN(key)) +khint32_t PANDAS_INLINE kh_float64_hash_func(double val){ + // 0.0 and -0.0 should have the same hash: + if (val == 0.0){ + return ZERO_HASH; + } + // all nans should have the same hash: + if ( val!=val ){ + return NAN_HASH; + } + khint64_t as_int = asint64(val); + return murmur2_64to32(as_int); +} #define kh_float64_hash_equal(a, b) ((a) == (b) || ((b) != (b) && (a) != (a))) From 450268b751b889b075b70af72ae0c18b4ff83b99 Mon Sep 17 00:00:00 2001 From: Egor Dranischnikow Date: Wed, 30 Sep 2020 22:09:49 +0200 Subject: [PATCH 4/8] fixing test cases: because the values were the same, the order was not unique/stable (obviously depends on hash-values of keys), ensure unique ordering --- pandas/tests/base/test_value_counts.py | 8 ++------ pandas/tests/test_algos.py | 22 ++++++++++------------ 2 files changed, 12 insertions(+), 18 deletions(-) diff --git a/pandas/tests/base/test_value_counts.py b/pandas/tests/base/test_value_counts.py index 1a6cba1ace35f..0c93ef242517b 100644 --- a/pandas/tests/base/test_value_counts.py +++ b/pandas/tests/base/test_value_counts.py @@ -232,18 +232,14 @@ def test_value_counts_datetime64(index_or_series): # with NaT s = df["dt"].copy() - s = klass(list(s.values) + [pd.NaT]) + s = klass(list(s.values) + [pd.NaT] * 4) result = s.value_counts() assert result.index.dtype == "datetime64[ns]" tm.assert_series_equal(result, expected_s) result = s.value_counts(dropna=False) - # GH 35922. 
NaN-like now sorts to the beginning of duplicate counts - idx = pd.to_datetime( - ["2010-01-01 00:00:00", "2008-09-09 00:00:00", pd.NaT, "2009-01-01 00:00:00"] - ) - expected_s = Series([3, 2, 1, 1], index=idx) + expected_s = pd.concat([pd.Series([4], index=DatetimeIndex([pd.NaT])), expected_s]) tm.assert_series_equal(result, expected_s) unique = s.unique() diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index 88286448de900..34b7d0e73e914 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -1173,12 +1173,12 @@ def test_dropna(self): ) tm.assert_series_equal( - Series([True, True, False, None]).value_counts(dropna=True), - Series([2, 1], index=[True, False]), + Series([True] * 3 + [False] * 2 + [None] * 5).value_counts(dropna=True), + Series([3, 2], index=[True, False]), ) tm.assert_series_equal( - Series([True, True, False, None]).value_counts(dropna=False), - Series([2, 1, 1], index=[True, np.nan, False]), + Series([True] * 5 + [False] * 3 + [None] * 2).value_counts(dropna=False), + Series([5, 3, 2], index=[True, False, np.nan]), ) tm.assert_series_equal( Series([10.3, 5.0, 5.0]).value_counts(dropna=True), @@ -1194,26 +1194,24 @@ def test_dropna(self): Series([2, 1], index=[5.0, 10.3]), ) - # 32-bit linux has a different ordering - if IS64: - result = Series([10.3, 5.0, 5.0, None]).value_counts(dropna=False) - expected = Series([2, 1, 1], index=[5.0, np.nan, 10.3]) - tm.assert_series_equal(result, expected) + result = Series([10.3, 10.3, 5.0, 5.0, 5.0, None]).value_counts(dropna=False) + expected = Series([3, 2, 1], index=[5.0, 10.3, np.nan]) + tm.assert_series_equal(result, expected) def test_value_counts_normalized(self): # GH12558 - s = Series([1, 2, np.nan, np.nan, np.nan]) + s = Series([1] * 2 + [2] * 3 + [np.nan] * 5) dtypes = (np.float64, object, "M8[ns]") for t in dtypes: s_typed = s.astype(t) result = s_typed.value_counts(normalize=True, dropna=False) expected = Series( - [0.6, 0.2, 0.2], index=Series([np.nan, 
1.0, 2.0], dtype=t) + [0.5, 0.3, 0.2], index=Series([np.nan, 2.0, 1.0], dtype=t) ) tm.assert_series_equal(result, expected) result = s_typed.value_counts(normalize=True, dropna=True) - expected = Series([0.5, 0.5], index=Series([1.0, 2.0], dtype=t)) + expected = Series([0.6, 0.4], index=Series([2.0, 1.0], dtype=t)) tm.assert_series_equal(result, expected) def test_value_counts_uint64(self): From 641eda13223efa0b4ca630497759fb1ed59c0dc5 Mon Sep 17 00:00:00 2001 From: Egor Dranischnikow Date: Tue, 3 Nov 2020 06:06:33 +0100 Subject: [PATCH 5/8] fixing doctest: the order of 1,4,2 depends on hash and is not unique --- pandas/core/base.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/core/base.py b/pandas/core/base.py index 4760b92ad5fec..b3366cca37617 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -982,9 +982,9 @@ def value_counts( >>> index = pd.Index([3, 1, 2, 3, 4, np.nan]) >>> index.value_counts() 3.0 2 - 1.0 1 2.0 1 4.0 1 + 1.0 1 dtype: int64 With `normalize` set to `True`, returns the relative frequency by @@ -993,9 +993,9 @@ def value_counts( >>> s = pd.Series([3, 1, 2, 3, 4, np.nan]) >>> s.value_counts(normalize=True) 3.0 0.4 - 1.0 0.2 2.0 0.2 4.0 0.2 + 1.0 0.2 dtype: float64 **bins** @@ -1017,10 +1017,10 @@ def value_counts( >>> s.value_counts(dropna=False) 3.0 2 - 1.0 1 2.0 1 - 4.0 1 NaN 1 + 4.0 1 + 1.0 1 dtype: int64 """ result = value_counts( From 309c9d66e3e5ae3516643edd3133f1fbf5d40870 Mon Sep 17 00:00:00 2001 From: Egor Dranischnikow Date: Tue, 13 Oct 2020 21:41:20 +0200 Subject: [PATCH 6/8] adding whatsnew-note --- doc/source/whatsnew/v1.2.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 09cb024cbd95c..0d08a01066cfc 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -376,6 +376,7 @@ Performance improvements - Reduced peak memory usage in :meth:`DataFrame.to_pickle` when using ``protocol=5`` in 
python 3.8+ (:issue:`34244`) - faster ``dir`` calls when many index labels, e.g. ``dir(ser)`` (:issue:`37450`) - Performance improvement in :class:`ExpandingGroupby` (:issue:`37064`) +- Performance improvement in :meth:`pd.DataFrame.groupby` for ``float`` ``dtype`` (:issue:`28303`) .. --------------------------------------------------------------------------- From f2496381d1dd088df60f29e6a9604dce30abaf1f Mon Sep 17 00:00:00 2001 From: Egor Dranischnikow Date: Sun, 18 Oct 2020 06:37:42 +0200 Subject: [PATCH 7/8] requested doc changes --- doc/source/whatsnew/v1.2.0.rst | 2 +- pandas/_libs/src/klib/khash_python.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 0d08a01066cfc..54ca602ff3c86 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -376,7 +376,7 @@ Performance improvements - Reduced peak memory usage in :meth:`DataFrame.to_pickle` when using ``protocol=5`` in python 3.8+ (:issue:`34244`) - faster ``dir`` calls when many index labels, e.g. ``dir(ser)`` (:issue:`37450`) - Performance improvement in :class:`ExpandingGroupby` (:issue:`37064`) -- Performance improvement in :meth:`pd.DataFrame.groupby` for ``float`` ``dtype`` (:issue:`28303`) +- Performance improvement in :meth:`pd.DataFrame.groupby` for ``float`` ``dtype`` (:issue:`28303`). Changes to the underlying hash function can change the sort order of ties in float-based indexes (e.g. :meth:`pd.Index.value_counts`) .. --------------------------------------------------------------------------- diff --git a/pandas/_libs/src/klib/khash_python.h b/pandas/_libs/src/klib/khash_python.h index e50b09bd00d74..aebc229abddd2 100644 --- a/pandas/_libs/src/klib/khash_python.h +++ b/pandas/_libs/src/klib/khash_python.h @@ -15,7 +15,7 @@ // hash for 64 bit integers.
// GH 13436 showed that _Py_HashDouble doesn't work well with khash // GH 28303 showed, that the simple xoring-version isn't good enough -// thus murmur2-hash is used +// See GH 36729 for evaluation of the currently used murmur2-hash version khint64_t PANDAS_INLINE asint64(double key) { khint64_t val; From e743c79693aa7a62d5fef5c24cadd06590261011 Mon Sep 17 00:00:00 2001 From: Egor Dranischnikow Date: Tue, 3 Nov 2020 08:24:11 +0100 Subject: [PATCH 8/8] use Series rather than pd.Series --- pandas/tests/base/test_value_counts.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/base/test_value_counts.py b/pandas/tests/base/test_value_counts.py index 0c93ef242517b..e9713e38f9874 100644 --- a/pandas/tests/base/test_value_counts.py +++ b/pandas/tests/base/test_value_counts.py @@ -239,7 +239,7 @@ def test_value_counts_datetime64(index_or_series): tm.assert_series_equal(result, expected_s) result = s.value_counts(dropna=False) - expected_s = pd.concat([pd.Series([4], index=DatetimeIndex([pd.NaT])), expected_s]) + expected_s = pd.concat([Series([4], index=DatetimeIndex([pd.NaT])), expected_s]) tm.assert_series_equal(result, expected_s) unique = s.unique()