Skip to content

PERF: using murmur hash for float64 khash-tables #36729

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 8 commits into from
Nov 14, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
164 changes: 164 additions & 0 deletions asv_bench/benchmarks/hash_functions.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,164 @@
import numpy as np

import pandas as pd


class IsinAlmostFullWithRandomInt:
    """Time ``Series.isin`` on random integers stored under several dtypes.

    Both the series and the lookup values hold ``M = 3 * 2 ** (exponent - 2)``
    random integers drawn from ``[0, M)``.
    """

    params = [
        # Builtin ``object`` replaces the ``np.object`` alias, which was
        # deprecated in NumPy 1.20 and removed in NumPy 1.24.
        [np.float64, np.int64, np.uint64, object],
        range(10, 21),
    ]
    param_names = ["dtype", "exponent"]

    def setup(self, dtype, exponent):
        """Build the series, an overlapping value set and a disjoint one."""
        M = 3 * 2 ** (exponent - 2)
        # Original note: 0.77 is the maximal share of occupied buckets.
        # M equals 0.75 * 2 ** exponent; presumably this keeps the hash table
        # close to that limit -- TODO confirm against khash's resize policy.
        np.random.seed(42)
        self.s = pd.Series(np.random.randint(0, M, M)).astype(dtype)
        self.values = np.random.randint(0, M, M).astype(dtype)
        # Shifting by M moves every lookup value into [M, 2 * M), i.e. outside
        # the range the series was drawn from.
        self.values_outside = self.values + M

    def time_isin(self, dtype, exponent):
        """Look up values drawn from the same range as the series."""
        self.s.isin(self.values)

    def time_isin_outside(self, dtype, exponent):
        """Look up values that cannot occur in the series."""
        self.s.isin(self.values_outside)


class IsinWithRandomFloat:
    """Time ``Series.isin`` on ``M`` random floats from ``np.random.rand``."""

    params = [
        # Builtin ``object`` replaces the ``np.object`` alias, which was
        # deprecated in NumPy 1.20 and removed in NumPy 1.24.
        [np.float64, object],
        [
            1_300,
            2_000,
            7_000,
            8_000,
            70_000,
            80_000,
            750_000,
            900_000,
        ],
    ]
    param_names = ["dtype", "M"]

    def setup(self, dtype, M):
        """Build the series and two lookup arrays of length ``M``."""
        np.random.seed(42)
        self.values = np.random.rand(M)
        self.s = pd.Series(self.values).astype(dtype)
        # The lookup values are the series' own values in shuffled order.
        np.random.shuffle(self.values)
        # Lookup values shifted by a constant 0.1.
        self.values_outside = self.values + 0.1

    def time_isin(self, dtype, M):
        """Look up the shuffled copy of the series' own values."""
        self.s.isin(self.values)

    def time_isin_outside(self, dtype, M):
        """Look up the values shifted by 0.1."""
        self.s.isin(self.values_outside)


class IsinWithArangeSorted:
    """Time ``Series.isin`` when series and lookup values are both ``arange(M)``."""

    params = [
        # Builtin ``object`` replaces the ``np.object`` alias, which was
        # deprecated in NumPy 1.20 and removed in NumPy 1.24.
        [np.float64, np.int64, np.uint64, object],
        [
            1_000,
            2_000,
            8_000,
            100_000,
            1_000_000,
        ],
    ]
    param_names = ["dtype", "M"]

    def setup(self, dtype, M):
        """Build the series and the lookup array, both ``0 .. M - 1`` in order."""
        self.s = pd.Series(np.arange(M)).astype(dtype)
        self.values = np.arange(M).astype(dtype)

    def time_isin(self, dtype, M):
        """Look up the sorted values in the sorted series."""
        self.s.isin(self.values)


class IsinWithArange:
    """Time ``Series.isin`` of 10 ** 6 random integers against ``arange(M)``.

    The series is drawn from ``[offset, M + offset)`` with
    ``offset = M * offset_factor``, so ``offset_factor == 0`` draws from the
    same range as the lookup values while ``-2`` and ``2`` draw from ranges
    that do not intersect ``[0, M)``.
    """

    params = [
        # Builtin ``object`` replaces the ``np.object`` alias, which was
        # deprecated in NumPy 1.20 and removed in NumPy 1.24.
        [np.float64, np.int64, np.uint64, object],
        [
            1_000,
            2_000,
            8_000,
        ],
        [-2, 0, 2],
    ]
    param_names = ["dtype", "M", "offset_factor"]

    def setup(self, dtype, M, offset_factor):
        """Build the random series and the ``arange(M)`` lookup values."""
        offset = int(M * offset_factor)
        np.random.seed(42)
        tmp = pd.Series(np.random.randint(offset, M + offset, 10 ** 6))
        self.s = tmp.astype(dtype)
        self.values = np.arange(M).astype(dtype)

    def time_isin(self, dtype, M, offset_factor):
        """Look up ``arange(M)`` in the random series."""
        self.s.isin(self.values)


class Float64GroupIndex:
    """Time a groupby whose keys are float64 values derived from timestamps."""

    # GH28303
    def setup(self):
        """Build a 10 ** 6 row frame and a rounded float group key per row."""
        # ``periods`` is documented as an integer; passing the float ``1e6``
        # is deprecated in recent pandas releases, so use an int literal.
        self.df = pd.date_range(
            start="1/1/2018", end="1/2/2018", periods=10 ** 6
        ).to_frame()
        # Integer view of the index divided by 1e9, then rounded -- presumably
        # epoch nanoseconds converted to whole seconds; confirm the index
        # resolution is nanoseconds on the pandas version in use.
        self.group_index = np.round(self.df.index.astype(int) / 1e9)

    def time_groupby(self):
        """Group the frame by the float key and take the last row per group."""
        self.df.groupby(self.group_index).last()


class UniqueAndFactorizeArange:
    """Time ``pd.factorize`` / ``pd.unique`` on repeated, offset float64 ranges."""

    params = range(4, 16)
    param_names = ["exponent"]

    def setup(self, exponent):
        """Store 10 ** 4 consecutive floats starting at 10 ** exponent, each repeated 100 times."""
        base = np.arange(10 ** 4, dtype="float64")
        shifted = base + 10 ** exponent
        self.a2 = shifted.repeat(100)

    def time_factorize(self, exponent):
        """Factorize the repeated array."""
        pd.factorize(self.a2)

    def time_unique(self, exponent):
        """Extract the unique values of the repeated array."""
        pd.unique(self.a2)


class NumericSeriesIndexing:
    """Time label slicing on a sorted numeric index containing one duplicate."""

    params = [
        # Parameterised by numpy dtype and built through ``pd.Index``: the
        # dedicated ``pd.Int64Index`` / ``pd.UInt64Index`` / ``pd.Float64Index``
        # classes were removed in pandas 2.0.
        (np.int64, np.uint64, np.float64),
        (10 ** 4, 10 ** 5, 5 * 10 ** 5, 10 ** 6, 5 * 10 ** 6),
    ]
    param_names = ["index_dtype", "N"]

    def setup(self, index, N):
        """Build a length-``N`` series; ``index`` is the dtype of its index."""
        # 0..54, a second 54, then 55..N-2: N labels, sorted, with exactly one
        # duplicated label.
        vals = np.array(
            list(range(55)) + [54] + list(range(55, N - 1)), dtype=index
        )
        indices = pd.Index(vals)
        self.data = pd.Series(np.arange(N), index=indices)

    def time_loc_slice(self, index, N):
        """Slice by label up to 800."""
        # trigger building of mapping
        self.data.loc[:800]


class NumericSeriesIndexingShuffled:
    """Time label slicing on a shuffled numeric index containing one duplicate."""

    params = [
        # Parameterised by numpy dtype and built through ``pd.Index``: the
        # dedicated ``pd.Int64Index`` / ``pd.UInt64Index`` / ``pd.Float64Index``
        # classes were removed in pandas 2.0.
        (np.int64, np.uint64, np.float64),
        (10 ** 4, 10 ** 5, 5 * 10 ** 5, 10 ** 6, 5 * 10 ** 6),
    ]
    param_names = ["index_dtype", "N"]

    def setup(self, index, N):
        """Build a length-``N`` series; ``index`` is the dtype of its index."""
        # 0..54, a second 54, then 55..N-2: N labels with exactly one
        # duplicated label, shuffled with a fixed seed.
        vals = np.array(
            list(range(55)) + [54] + list(range(55, N - 1)), dtype=index
        )
        np.random.seed(42)
        np.random.shuffle(vals)
        indices = pd.Index(vals)
        self.data = pd.Series(np.arange(N), index=indices)

    def time_loc_slice(self, index, N):
        """Slice by label up to 800."""
        # trigger building of mapping
        self.data.loc[:800]
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.2.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -376,6 +376,7 @@ Performance improvements
- Reduced peak memory usage in :meth:`DataFrame.to_pickle` when using ``protocol=5`` in python 3.8+ (:issue:`34244`)
- faster ``dir`` calls when many index labels, e.g. ``dir(ser)`` (:issue:`37450`)
- Performance improvement in :class:`ExpandingGroupby` (:issue:`37064`)
- Performance improvement in :meth:`DataFrame.groupby` for ``float`` ``dtype`` (:issue:`28303`). Because the underlying hash function changed, the sort order of ties for float-based indexes may change (e.g. in :meth:`Index.value_counts`)

.. ---------------------------------------------------------------------------

Expand Down
78 changes: 77 additions & 1 deletion pandas/_libs/src/klib/khash.h
Original file line number Diff line number Diff line change
Expand Up @@ -143,10 +143,86 @@ typedef khint_t khiter_t;
#define __ac_set_isboth_false(flag, i) __ac_set_isempty_false(flag, i)
#define __ac_set_isdel_true(flag, i) ((void)0)


// specializations of https://github.com/aappleby/smhasher/blob/master/src/MurmurHash2.cpp
khint32_t PANDAS_INLINE murmur2_32to32(khint32_t k){
    // Mixing multiplier and shift. They were generated offline and are not
    // 'magic'; they simply happen to mix well.
    const khint32_t mult = 0x5bd1e995;
    const int shift = 24;
    const khint32_t seed = 0xc70f6907UL;

    // Start the hash state from a 'random' value: the fixed seed xor 4.
    khint32_t hash = seed ^ 4;

    // Scramble the single 4-byte input word ...
    k *= mult;
    k ^= k >> shift;
    k *= mult;

    // ... and fold it into the state.
    hash = (hash * mult) ^ k;

    // Final mixing rounds so that the last bytes are well incorporated
    // (the original author questions whether they are really needed here).
    hash ^= hash >> 13;
    hash *= mult;
    hash ^= hash >> 15;
    return hash;
}

// A dedicated x64 version would need fewer operations, but always using the
// 32-bit version also has some benefits:
// - a single code path for 32-bit and 64-bit builds
// - the same behavior for 32-bit and 64-bit builds
// - no performance difference could be measured compared to a possible x64 version

khint32_t PANDAS_INLINE murmur2_32_32to32(khint32_t k1, khint32_t k2){
    // Mixing multiplier and shift. They were generated offline and are not
    // 'magic'; they simply happen to mix well.
    const khint32_t mult = 0x5bd1e995;
    const int shift = 24;
    const khint32_t seed = 0xc70f6907UL;

    // Start the hash state from a 'random' value: the fixed seed xor 4.
    // NOTE(review): the literal 4 is used although two 4-byte words are
    // hashed; reference MurmurHash2 xors the seed with the key length --
    // presumably harmless, confirm before changing (it would alter all hashes).
    khint32_t hash = seed ^ 4;

    // Scramble the first 4-byte word and fold it into the state.
    k1 *= mult;
    k1 ^= k1 >> shift;
    k1 *= mult;
    hash = (hash * mult) ^ k1;

    // Scramble the second 4-byte word and fold it into the state.
    k2 *= mult;
    k2 ^= k2 >> shift;
    k2 *= mult;
    hash = (hash * mult) ^ k2;

    // Final mixing rounds so that the last bytes are well incorporated.
    hash ^= hash >> 13;
    hash *= mult;
    hash ^= hash >> 15;
    return hash;
}

// Hash a 64-bit key down to 32 bits by splitting it into its low and high
// 32-bit halves and feeding both to murmur2_32_32to32 (low half first).
khint32_t PANDAS_INLINE murmur2_64to32(khint64_t k){
    return murmur2_32_32to32((khint32_t)k, (khint32_t)(k >> 32));
}


#ifdef KHASH_LINEAR
#define __ac_inc(k, m) 1
#else
#define __ac_inc(k, m) (((k)>>3 ^ (k)<<3) | 1) & (m)
#define __ac_inc(k, m) (murmur2_32to32(k) | 1) & (m)
#endif

#define __ac_fsize(m) ((m) < 32? 1 : (m)>>5)
Expand Down
36 changes: 21 additions & 15 deletions pandas/_libs/src/klib/khash_python.h
Original file line number Diff line number Diff line change
Expand Up @@ -13,25 +13,31 @@
// is 64 bits the truncation causes collision issues. Given all that, we use our own
// simple hash, viewing the double bytes as an int64 and using khash's default
// hash for 64 bit integers.
// GH 13436
// GH 13436 showed that _Py_HashDouble doesn't work well with khash
// GH 28303 showed that the simple XOR-based version isn't good enough
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you add a tiny bit more content here on why this approach was chosen over the CPython approach?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This was a late comment; please update in a follow-up.

// See GH 36729 for evaluation of the currently used murmur2-hash version

// Return the raw bit pattern of a double as a 64-bit integer.
// memcpy is used to copy the bytes of `key` into `val` instead of casting
// through a pointer of a different type.
khint64_t PANDAS_INLINE asint64(double key) {
    khint64_t val;
    memcpy(&val, &key, sizeof(double));
    return val;
}

// correct for all inputs but not -0.0 and NaNs
#define kh_float64_hash_func_0_NAN(key) (khint32_t)((asint64(key))>>33^(asint64(key))^(asint64(key))<<11)

// correct for all inputs but not NaNs
#define kh_float64_hash_func_NAN(key) ((key) == 0.0 ? \
kh_float64_hash_func_0_NAN(0.0) : \
kh_float64_hash_func_0_NAN(key))
#define ZERO_HASH 0
#define NAN_HASH 0

// correct for all
#define kh_float64_hash_func(key) ((key) != (key) ? \
kh_float64_hash_func_NAN(Py_NAN) : \
kh_float64_hash_func_NAN(key))
// Hash a double for the float64 khash tables: all NaNs share one hash,
// 0.0 and -0.0 share one hash, and every other value is hashed by running
// murmur2_64to32 over its bit pattern.
khint32_t PANDAS_INLINE kh_float64_hash_func(double val){
    // all nans should have the same hash (a NaN never equals itself):
    if (val != val) {
        return NAN_HASH;
    }
    // 0.0 and -0.0 compare equal and should have the same hash:
    if (val == 0.0) {
        return ZERO_HASH;
    }
    return murmur2_64to32(asint64(val));
}

#define kh_float64_hash_equal(a, b) ((a) == (b) || ((b) != (b) && (a) != (a)))

Expand Down
8 changes: 4 additions & 4 deletions pandas/core/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -982,9 +982,9 @@ def value_counts(
>>> index = pd.Index([3, 1, 2, 3, 4, np.nan])
>>> index.value_counts()
3.0 2
1.0 1
2.0 1
4.0 1
1.0 1
dtype: int64

With `normalize` set to `True`, returns the relative frequency by
Expand All @@ -993,9 +993,9 @@ def value_counts(
>>> s = pd.Series([3, 1, 2, 3, 4, np.nan])
>>> s.value_counts(normalize=True)
3.0 0.4
1.0 0.2
2.0 0.2
4.0 0.2
1.0 0.2
dtype: float64

**bins**
Expand All @@ -1017,10 +1017,10 @@ def value_counts(

>>> s.value_counts(dropna=False)
3.0 2
1.0 1
2.0 1
4.0 1
NaN 1
4.0 1
1.0 1
dtype: int64
"""
result = value_counts(
Expand Down
8 changes: 2 additions & 6 deletions pandas/tests/base/test_value_counts.py
Original file line number Diff line number Diff line change
Expand Up @@ -232,18 +232,14 @@ def test_value_counts_datetime64(index_or_series):

# with NaT
s = df["dt"].copy()
s = klass(list(s.values) + [pd.NaT])
s = klass(list(s.values) + [pd.NaT] * 4)

result = s.value_counts()
assert result.index.dtype == "datetime64[ns]"
tm.assert_series_equal(result, expected_s)

result = s.value_counts(dropna=False)
# GH 35922. NaN-like now sorts to the beginning of duplicate counts
idx = pd.to_datetime(
["2010-01-01 00:00:00", "2008-09-09 00:00:00", pd.NaT, "2009-01-01 00:00:00"]
)
expected_s = Series([3, 2, 1, 1], index=idx)
expected_s = pd.concat([Series([4], index=DatetimeIndex([pd.NaT])), expected_s])
tm.assert_series_equal(result, expected_s)

unique = s.unique()
Expand Down
Loading