Skip to content

Commit 4cfa97a

Browse files
authored
PERF: using murmur hash for float64 khash-tables (#36729)
1 parent 524fc9c commit 4cfa97a

File tree

7 files changed

+279
-38
lines changed

7 files changed

+279
-38
lines changed
+164
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,164 @@
1+
import numpy as np
2+
3+
import pandas as pd
4+
5+
6+
class IsinAlmostFullWithRandomInt:
    """Benchmark ``Series.isin`` on ``M = 3 * 2**(exponent - 2)`` random integers.

    The series and ``values`` are drawn independently from ``[0, M)``;
    ``values_outside`` is ``values`` shifted by ``M``, so none of its elements
    occurs in the series.
    """

    params = [
        # Builtin ``object`` replaces the ``np.object`` alias, which was
        # deprecated in NumPy 1.20 and removed in NumPy 1.24.
        [np.float64, np.int64, np.uint64, object],
        range(10, 21),
    ]
    param_names = ["dtype", "exponent"]

    def setup(self, dtype, exponent):
        """Create the series and both lookup arrays for one parameter combination."""
        M = 3 * 2 ** (exponent - 2)
        # 0.77-the maximal share of occupied buckets
        np.random.seed(42)
        self.s = pd.Series(np.random.randint(0, M, M)).astype(dtype)
        self.values = np.random.randint(0, M, M).astype(dtype)
        self.values_outside = self.values + M

    def time_isin(self, dtype, exponent):
        """Time ``isin`` against values drawn from the same range as the series."""
        self.s.isin(self.values)

    def time_isin_outside(self, dtype, exponent):
        """Time ``isin`` against values that are all absent from the series."""
        self.s.isin(self.values_outside)
26+
27+
28+
class IsinWithRandomFloat:
    """Benchmark ``Series.isin`` on ``M`` random floats from ``np.random.rand``.

    ``values`` holds the same numbers as the series in shuffled order;
    ``values_outside`` is ``values`` shifted by ``0.1``.
    """

    params = [
        # Builtin ``object`` replaces the ``np.object`` alias, which was
        # deprecated in NumPy 1.20 and removed in NumPy 1.24.
        [np.float64, object],
        [
            1_300,
            2_000,
            7_000,
            8_000,
            70_000,
            80_000,
            750_000,
            900_000,
        ],
    ]
    param_names = ["dtype", "M"]

    def setup(self, dtype, M):
        """Create the series and both lookup arrays for one parameter combination."""
        np.random.seed(42)
        self.values = np.random.rand(M)
        self.s = pd.Series(self.values).astype(dtype)
        # Shuffle after the series was built so lookup order differs from it.
        np.random.shuffle(self.values)
        self.values_outside = self.values + 0.1

    def time_isin(self, dtype, M):
        """Time ``isin`` against the shuffled values of the series itself."""
        self.s.isin(self.values)

    def time_isin_outside(self, dtype, M):
        """Time ``isin`` against the values shifted by ``0.1``."""
        self.s.isin(self.values_outside)
56+
57+
58+
class IsinWithArangeSorted:
    """Benchmark ``Series.isin`` where series and values are both ``arange(M)``."""

    params = [
        # Builtin ``object`` replaces the ``np.object`` alias, which was
        # deprecated in NumPy 1.20 and removed in NumPy 1.24.
        [np.float64, np.int64, np.uint64, object],
        [
            1_000,
            2_000,
            8_000,
            100_000,
            1_000_000,
        ],
    ]
    param_names = ["dtype", "M"]

    def setup(self, dtype, M):
        """Create the sorted series and the identical lookup array."""
        self.s = pd.Series(np.arange(M)).astype(dtype)
        self.values = np.arange(M).astype(dtype)

    def time_isin(self, dtype, M):
        """Time ``isin`` of the sorted series against the sorted values."""
        self.s.isin(self.values)
77+
78+
79+
class IsinWithArange:
    """Benchmark ``Series.isin`` of 10**6 random integers against ``arange(M)``.

    The series is drawn from ``[offset, M + offset)`` with
    ``offset = M * offset_factor``, so ``offset_factor`` moves the series
    relative to the looked-up range ``[0, M)``.
    """

    params = [
        # Builtin ``object`` replaces the ``np.object`` alias, which was
        # deprecated in NumPy 1.20 and removed in NumPy 1.24.
        [np.float64, np.int64, np.uint64, object],
        [
            1_000,
            2_000,
            8_000,
        ],
        [-2, 0, 2],
    ]
    param_names = ["dtype", "M", "offset_factor"]

    def setup(self, dtype, M, offset_factor):
        """Create the random series and the ``arange(M)`` lookup array."""
        offset = int(M * offset_factor)
        np.random.seed(42)
        tmp = pd.Series(np.random.randint(offset, M + offset, 10 ** 6))
        self.s = tmp.astype(dtype)
        self.values = np.arange(M).astype(dtype)

    def time_isin(self, dtype, M, offset_factor):
        """Time ``isin`` of the random series against ``arange(M)``."""
        self.s.isin(self.values)
100+
101+
102+
class Float64GroupIndex:
    """Benchmark ``DataFrame.groupby`` with a float64 array of group keys."""

    # GH28303
    def setup(self):
        """Build a 10**6-row frame and float keys derived from its index."""
        # ``periods`` must be an integer; the float literal ``1e6`` is not
        # accepted by newer pandas versions.
        self.df = pd.date_range(
            start="1/1/2018", end="1/2/2018", periods=10 ** 6
        ).to_frame()
        # Integer timestamps divided by 1e9 and rounded give float group keys.
        self.group_index = np.round(self.df.index.astype(int) / 1e9)

    def time_groupby(self):
        """Time grouping by the float keys and taking the last row per group."""
        self.df.groupby(self.group_index).last()
112+
113+
114+
class UniqueAndFactorizeArange:
    """Time ``pd.factorize`` and ``pd.unique`` on shifted, repeated floats.

    The input holds the 10**4 consecutive float64 values starting at
    ``10**exponent``, each repeated 100 times.
    """

    params = range(4, 16)
    param_names = ["exponent"]

    def setup(self, exponent):
        """Build the float64 input array ``self.a2`` for the given exponent."""
        base = np.arange(10 ** 4, dtype="float64")
        shifted = base + 10 ** exponent
        self.a2 = np.repeat(shifted, 100)

    def time_factorize(self, exponent):
        """Time ``pd.factorize`` on the prepared array."""
        pd.factorize(self.a2)

    def time_unique(self, exponent):
        """Time ``pd.unique`` on the prepared array."""
        pd.unique(self.a2)
127+
128+
129+
class NumericSeriesIndexing:
    """Benchmark label slicing on a sorted numeric index with one duplicate.

    The index holds ``0 .. N - 2`` in increasing order with the label ``54``
    occurring twice, so it is monotonic but not unique.
    """

    params = [
        # Index dtypes; the dedicated ``pd.Int64Index`` / ``pd.UInt64Index`` /
        # ``pd.Float64Index`` classes were removed in pandas 2.0, so the index
        # is built through ``pd.Index`` from an array of the requested dtype.
        (np.int64, np.uint64, np.float64),
        (10 ** 4, 10 ** 5, 5 * 10 ** 5, 10 ** 6, 5 * 10 ** 6),
    ]
    param_names = ["index_dtype", "N"]

    def setup(self, index, N):
        """Create ``self.data``; ``index`` is the dtype of the index to build."""
        vals = np.array(
            list(range(55)) + [54] + list(range(55, N - 1)), dtype=index
        )
        indices = pd.Index(vals)
        self.data = pd.Series(np.arange(N), index=indices)

    def time_loc_slice(self, index, N):
        """Time a label-based ``.loc`` slice up to label 800."""
        # trigger building of mapping
        self.data.loc[:800]
145+
146+
147+
class NumericSeriesIndexingShuffled:
    """Benchmark label slicing on a shuffled numeric index with one duplicate.

    The index holds the labels ``0 .. N - 2`` with ``54`` occurring twice,
    shuffled with a fixed seed.
    """

    params = [
        # Index dtypes; the dedicated ``pd.Int64Index`` / ``pd.UInt64Index`` /
        # ``pd.Float64Index`` classes were removed in pandas 2.0, so the index
        # is built through ``pd.Index`` from an array of the requested dtype.
        (np.int64, np.uint64, np.float64),
        (10 ** 4, 10 ** 5, 5 * 10 ** 5, 10 ** 6, 5 * 10 ** 6),
    ]
    param_names = ["index_dtype", "N"]

    def setup(self, index, N):
        """Create ``self.data``; ``index`` is the dtype of the index to build."""
        vals = np.array(
            list(range(55)) + [54] + list(range(55, N - 1)), dtype=index
        )
        np.random.seed(42)
        np.random.shuffle(vals)
        indices = pd.Index(vals)
        self.data = pd.Series(np.arange(N), index=indices)

    def time_loc_slice(self, index, N):
        """Time a label-based ``.loc`` slice up to label 800."""
        # trigger building of mapping
        self.data.loc[:800]

doc/source/whatsnew/v1.2.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -424,6 +424,7 @@ Performance improvements
424424
- Reduced peak memory usage in :meth:`DataFrame.to_pickle` when using ``protocol=5`` in python 3.8+ (:issue:`34244`)
425425
- faster ``dir`` calls when many index labels, e.g. ``dir(ser)`` (:issue:`37450`)
426426
- Performance improvement in :class:`ExpandingGroupby` (:issue:`37064`)
427+
- Performance improvement in :meth:`pd.DataFrame.groupby` for ``float`` ``dtype`` (:issue:`28303`). Changes to the underlying hash function can change the sort order of ties in float-based indexes (e.g. :meth:`pd.Index.value_counts`)
427428

428429
.. ---------------------------------------------------------------------------
429430

pandas/_libs/src/klib/khash.h

+77-1
Original file line numberDiff line numberDiff line change
@@ -143,10 +143,86 @@ typedef khint_t khiter_t;
143143
#define __ac_set_isboth_false(flag, i) __ac_set_isempty_false(flag, i)
144144
#define __ac_set_isdel_true(flag, i) ((void)0)
145145

146+
147+
// specializations of https://github.com/aappleby/smhasher/blob/master/src/MurmurHash2.cpp
148+
khint32_t PANDAS_INLINE murmur2_32to32(khint32_t k){
    // Mixing multiplier and shift width; generated offline, they are not
    // 'magic', they just happen to work well.
    const khint32_t MIX_MULT = 0x5bd1e995;
    const int MIX_SHIFT = 24;
    const khint32_t SEED = 0xc70f6907UL;

    // Initial hash state: a 'random' value derived from the seed.
    khint32_t hash = SEED ^ 4;

    // Scramble the single 4-byte input word ...
    k *= MIX_MULT;
    k ^= k >> MIX_SHIFT;
    k *= MIX_MULT;

    // ... and fold it into the hash state.
    hash = (hash * MIX_MULT) ^ k;

    // A few final mixes so that the last bytes are well incorporated
    // (possibly not strictly needed for a single word).
    hash ^= hash >> 13;
    hash *= MIX_MULT;
    hash ^= hash >> 15;
    return hash;
}
173+
174+
// it is possible to have a special x64-version, which would need less operations, but
175+
// always using the 32bit version also has some benefits:
176+
// - one code for 32bit and 64bit builds
177+
// - the same case for 32bit and 64bit builds
178+
// - no performance difference could be measured compared to a possible x64-version
179+
180+
khint32_t PANDAS_INLINE murmur2_32_32to32(khint32_t k1, khint32_t k2){
    // Mixing multiplier and shift width; generated offline, they are not
    // 'magic', they just happen to work well.
    const khint32_t MIX_MULT = 0x5bd1e995;
    const int MIX_SHIFT = 24;
    const khint32_t SEED = 0xc70f6907UL;

    // Initial hash state: a 'random' value derived from the seed.
    khint32_t hash = SEED ^ 4;

    // Scramble the first 4-byte word and fold it into the hash state.
    k1 *= MIX_MULT;
    k1 ^= k1 >> MIX_SHIFT;
    k1 *= MIX_MULT;
    hash = (hash * MIX_MULT) ^ k1;

    // Same treatment for the second 4-byte word.
    k2 *= MIX_MULT;
    k2 ^= k2 >> MIX_SHIFT;
    k2 *= MIX_MULT;
    hash = (hash * MIX_MULT) ^ k2;

    // A few final mixes so that the last bytes are well incorporated.
    hash ^= hash >> 13;
    hash *= MIX_MULT;
    hash ^= hash >> 15;
    return hash;
}
213+
214+
khint32_t PANDAS_INLINE murmur2_64to32(khint64_t k){
    // Hash the low and the high 32 bits of the key as two separate words.
    return murmur2_32_32to32((khint32_t)k, (khint32_t)(k >> 32));
}
220+
221+
146222
#ifdef KHASH_LINEAR
147223
#define __ac_inc(k, m) 1
148224
#else
149-
#define __ac_inc(k, m) (((k)>>3 ^ (k)<<3) | 1) & (m)
225+
// Probe increment: hash of the key, forced odd, masked to the table size.
// The whole replacement list is parenthesized so that the macro expands
// safely inside a larger expression.
#define __ac_inc(k, m) ((murmur2_32to32(k) | 1) & (m))
150226
#endif
151227

152228
#define __ac_fsize(m) ((m) < 32? 1 : (m)>>5)

pandas/_libs/src/klib/khash_python.h

+21-15
Original file line numberDiff line numberDiff line change
@@ -13,25 +13,31 @@
1313
// is 64 bits the truncation causes collision issues. Given all that, we use our own
1414
// simple hash, viewing the double bytes as an int64 and using khash's default
1515
// hash for 64 bit integers.
16-
// GH 13436
16+
// GH 13436 showed that _Py_HashDouble doesn't work well with khash
17+
// GH 28303 showed that the simple xoring-version isn't good enough
18+
// See GH 36729 for evaluation of the currently used murmur2-hash version
19+
1720
khint64_t PANDAS_INLINE asint64(double key) {
18-
khint64_t val;
19-
memcpy(&val, &key, sizeof(double));
20-
return val;
21+
khint64_t val;
22+
memcpy(&val, &key, sizeof(double));
23+
return val;
2124
}
2225

23-
// correct for all inputs but not -0.0 and NaNs
24-
#define kh_float64_hash_func_0_NAN(key) (khint32_t)((asint64(key))>>33^(asint64(key))^(asint64(key))<<11)
25-
26-
// correct for all inputs but not NaNs
27-
#define kh_float64_hash_func_NAN(key) ((key) == 0.0 ? \
28-
kh_float64_hash_func_0_NAN(0.0) : \
29-
kh_float64_hash_func_0_NAN(key))
26+
#define ZERO_HASH 0
27+
#define NAN_HASH 0
3028

31-
// correct for all
32-
#define kh_float64_hash_func(key) ((key) != (key) ? \
33-
kh_float64_hash_func_NAN(Py_NAN) : \
34-
kh_float64_hash_func_NAN(key))
29+
khint32_t PANDAS_INLINE kh_float64_hash_func(double val){
    // every NaN (val != val holds only for NaN) gets the same hash:
    if (val != val){
        return NAN_HASH;
    }
    // 0.0 and -0.0 compare equal and must therefore share one hash:
    if (val == 0.0){
        return ZERO_HASH;
    }
    // any other value: murmur2 hash of the double's bits viewed as an int64
    return murmur2_64to32(asint64(val));
}
3541

3642
#define kh_float64_hash_equal(a, b) ((a) == (b) || ((b) != (b) && (a) != (a)))
3743

pandas/core/base.py

+4-4
Original file line numberDiff line numberDiff line change
@@ -982,9 +982,9 @@ def value_counts(
982982
>>> index = pd.Index([3, 1, 2, 3, 4, np.nan])
983983
>>> index.value_counts()
984984
3.0 2
985-
1.0 1
986985
2.0 1
987986
4.0 1
987+
1.0 1
988988
dtype: int64
989989
990990
With `normalize` set to `True`, returns the relative frequency by
@@ -993,9 +993,9 @@ def value_counts(
993993
>>> s = pd.Series([3, 1, 2, 3, 4, np.nan])
994994
>>> s.value_counts(normalize=True)
995995
3.0 0.4
996-
1.0 0.2
997996
2.0 0.2
998997
4.0 0.2
998+
1.0 0.2
999999
dtype: float64
10001000
10011001
**bins**
@@ -1017,10 +1017,10 @@ def value_counts(
10171017
10181018
>>> s.value_counts(dropna=False)
10191019
3.0 2
1020-
1.0 1
10211020
2.0 1
1022-
4.0 1
10231021
NaN 1
1022+
4.0 1
1023+
1.0 1
10241024
dtype: int64
10251025
"""
10261026
result = value_counts(

pandas/tests/base/test_value_counts.py

+2-6
Original file line numberDiff line numberDiff line change
@@ -232,18 +232,14 @@ def test_value_counts_datetime64(index_or_series):
232232

233233
# with NaT
234234
s = df["dt"].copy()
235-
s = klass(list(s.values) + [pd.NaT])
235+
s = klass(list(s.values) + [pd.NaT] * 4)
236236

237237
result = s.value_counts()
238238
assert result.index.dtype == "datetime64[ns]"
239239
tm.assert_series_equal(result, expected_s)
240240

241241
result = s.value_counts(dropna=False)
242-
# GH 35922. NaN-like now sorts to the beginning of duplicate counts
243-
idx = pd.to_datetime(
244-
["2010-01-01 00:00:00", "2008-09-09 00:00:00", pd.NaT, "2009-01-01 00:00:00"]
245-
)
246-
expected_s = Series([3, 2, 1, 1], index=idx)
242+
expected_s = pd.concat([Series([4], index=DatetimeIndex([pd.NaT])), expected_s])
247243
tm.assert_series_equal(result, expected_s)
248244

249245
unique = s.unique()

0 commit comments

Comments
 (0)