Skip to content

Commit 0d7fe27

Browse files
committed
BUG: fix issues with the hash function for Float64HashTable (GH21866)
This commit fixes two issues: 1) hash(0.0) != hash(-0.0), and 2) hash(x) != hash(y) for different x, y that are both NaNs. They are solved by defining: 1) hash(-0.0) := hash(0.0), and 2) hash(x) := hash(np.nan) for every x that is NaN.
1 parent bdb6168 commit 0d7fe27

File tree

2 files changed

+46
-1
lines changed

2 files changed

+46
-1
lines changed

pandas/_libs/src/klib/khash_python.h

+12-1
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,18 @@ khint64_t PANDAS_INLINE asint64(double key) {
1919
memcpy(&val, &key, sizeof(double));
2020
return val;
2121
}
22-
#define kh_float64_hash_func(key) (khint32_t)((asint64(key))>>33^(asint64(key))^(asint64(key))<<11)
22+
23+
// hashes the raw 64-bit pattern of the double; correct for all inputs except -0.0 and NaNs
24+
#define kh_float64_hash_func_0_NAN(key) (khint32_t)((asint64(key))>>33^(asint64(key))^(asint64(key))<<11)
25+
// hashes any key that compares equal to 0.0 (including -0.0) as 0.0; correct for all inputs except NaNs
26+
#define kh_float64_hash_func_NAN(key) ((key) == 0.0 ? \
27+
kh_float64_hash_func_0_NAN(0.0) : \
28+
kh_float64_hash_func_0_NAN(key))
29+
// correct for all inputs: any NaN (key != key) is hashed as the single value NAN
30+
#define kh_float64_hash_func(key) ((key) != (key) ? \
31+
kh_float64_hash_func_NAN(NAN) : \
32+
kh_float64_hash_func_NAN(key))
33+
2334
#define kh_float64_hash_equal(a, b) ((a) == (b) || ((b) != (b) && (a) != (a)))
2435

2536
#define KHASH_MAP_INIT_FLOAT64(name, khval_t) \

pandas/tests/test_algos.py

+34
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
from numpy import nan
88
from datetime import datetime
99
from itertools import permutations
10+
import struct
1011
from pandas import (Series, Categorical, CategoricalIndex,
1112
Timestamp, DatetimeIndex, Index, IntervalIndex)
1213
import pandas as pd
@@ -1087,6 +1088,39 @@ def test_lookup_nan(self, writable):
10871088
tm.assert_numpy_array_equal(m.lookup(xs), np.arange(len(xs),
10881089
dtype=np.int64))
10891090

1091+
def test_add_signed_zeros(self):
    # With the default hash function, 0.0 and -0.0 would land in
    # different hash buckets once there are more than 2^30 buckets.
    # The memory is not touched, so the system doesn't commit 16GB.
    n_buckets = 12 * 10**8
    table = ht.Float64HashTable(n_buckets)
    # The table is prepared; insert both signed zeros, which might
    # otherwise end up in different buckets.
    for zero in (0.0, -0.0):
        table.set_item(zero, 0)
    assert len(table) == 1  # 0.0 and -0.0 are equivalent
1101+
1102+
def test_add_signed_nans(self):
    # With the default hash function, NAN and -NAN would land in
    # different hash buckets once there are more than 2^30 buckets.
    # The memory is not touched, so the system doesn't commit 16GB.
    n_buckets = 12 * 10**8
    table = ht.Float64HashTable(n_buckets)
    # The table is prepared; insert both signed NaNs, which might
    # otherwise end up in different buckets.
    for signed_nan in (np.nan, -np.nan):
        table.set_item(signed_nan, 0)
    assert len(table) == 1  # nan and -nan are equivalent
1112+
1113+
def test_add_different_nans(self):
    # Create two different NaNs from explicit 64-bit patterns.
    # "=Q" is a standard-size (8-byte) unsigned integer on every
    # platform; native "L" is only 4 bytes on some platforms
    # (e.g. Windows), where packing a 64-bit pattern raises struct.error.
    NAN1 = struct.unpack("d", struct.pack("=Q", 0x7ff8000000000000))[0]
    NAN2 = struct.unpack("d", struct.pack("=Q", 0x7ff8000000000001))[0]
    # Sanity check: both bit patterns really decode to NaN.
    assert NAN1 != NAN1
    assert NAN2 != NAN2
    # The default hash function would lead to different hash buckets
    # for NAN1 and NAN2 even if there are only 4 buckets:
    m = ht.Float64HashTable()
    m.set_item(NAN1, 0)
    m.set_item(NAN2, 0)
    assert len(m) == 1  # NAN1 and NAN2 are equivalent
1123+
10901124
def test_lookup_overflow(self, writable):
10911125
xs = np.array([1, 2, 2**63], dtype=np.uint64)
10921126
# GH 21688 ensure we can deal with readonly memory views

0 commit comments

Comments
 (0)