Skip to content

Commit 2c7c797

Browse files
realead, jbrockmendel
authored and committed
BUG: issues with hash-function for Float64HashTable (GH21866) (#21904)
* BUG: issues with hash-function for Float64HashTable (GH21866)

  The following issues:
    1) hash(0.0) != hash(-0.0)
    2) hash(x) != hash(y) for different x, y which are both NaNs
  are solved by setting:
    1) hash(-0.0) := hash(0.0)
    2) hash(x) := hash(np.nan) for every x which is NaN

* add the id of the issue to tests
1 parent 600b9c8 commit 2c7c797

File tree

3 files changed

+60
-1
lines changed

3 files changed

+60
-1
lines changed

doc/source/whatsnew/v0.24.0.txt

+1
Original file line numberDiff line numberDiff line change
@@ -474,6 +474,7 @@ Numeric
474474

475475
- Bug in :class:`Series` ``__rmatmul__`` doesn't support matrix vector multiplication (:issue:`21530`)
476476
- Bug in :func:`factorize` fails with read-only array (:issue:`12813`)
477+
- Fixed bug in :func:`unique` where signed zeros were handled inconsistently: for some inputs 0.0 and -0.0 were treated as equal and for other inputs as different. Now they are treated as equal for all inputs (:issue:`21866`)
477478
-
478479
-
479480

pandas/_libs/src/klib/khash_python.h

+14-1
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,20 @@ khint64_t PANDAS_INLINE asint64(double key) {
1919
memcpy(&val, &key, sizeof(double));
2020
return val;
2121
}
22-
#define kh_float64_hash_func(key) (khint32_t)((asint64(key))>>33^(asint64(key))^(asint64(key))<<11)
22+
23+
// correct for all inputs except -0.0 and NaNs
24+
#define kh_float64_hash_func_0_NAN(key) (khint32_t)((asint64(key))>>33^(asint64(key))^(asint64(key))<<11)
25+
26+
// correct for all inputs except NaNs
27+
#define kh_float64_hash_func_NAN(key) ((key) == 0.0 ? \
28+
kh_float64_hash_func_0_NAN(0.0) : \
29+
kh_float64_hash_func_0_NAN(key))
30+
31+
// correct for all inputs
32+
#define kh_float64_hash_func(key) ((key) != (key) ? \
33+
kh_float64_hash_func_NAN(Py_NAN) : \
34+
kh_float64_hash_func_NAN(key))
35+
2336
#define kh_float64_hash_equal(a, b) ((a) == (b) || ((b) != (b) && (a) != (a)))
2437

2538
#define KHASH_MAP_INIT_FLOAT64(name, khval_t) \

pandas/tests/test_algos.py

+45
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
from numpy import nan
88
from datetime import datetime
99
from itertools import permutations
10+
import struct
1011
from pandas import (Series, Categorical, CategoricalIndex,
1112
Timestamp, DatetimeIndex, Index, IntervalIndex)
1213
import pandas as pd
@@ -500,6 +501,25 @@ def test_obj_none_preservation(self):
500501

501502
tm.assert_numpy_array_equal(result, expected, strict_nan=True)
502503

504+
def test_signed_zero(self):
505+
# GH 21866
506+
a = np.array([-0.0, 0.0])
507+
result = pd.unique(a)
508+
expected = np.array([-0.0]) # 0.0 and -0.0 are equivalent
509+
tm.assert_numpy_array_equal(result, expected)
510+
511+
def test_different_nans(self):
512+
# GH 21866
513+
# create different nans from bit-patterns:
514+
NAN1 = struct.unpack("d", struct.pack("=Q", 0x7ff8000000000000))[0]
515+
NAN2 = struct.unpack("d", struct.pack("=Q", 0x7ff8000000000001))[0]
516+
assert NAN1 != NAN1
517+
assert NAN2 != NAN2
518+
a = np.array([NAN1, NAN2]) # NAN1 and NAN2 are equivalent
519+
result = pd.unique(a)
520+
expected = np.array([np.nan])
521+
tm.assert_numpy_array_equal(result, expected)
522+
503523

504524
class TestIsin(object):
505525

@@ -1087,6 +1107,31 @@ def test_lookup_nan(self, writable):
10871107
tm.assert_numpy_array_equal(m.lookup(xs), np.arange(len(xs),
10881108
dtype=np.int64))
10891109

1110+
def test_add_signed_zeros(self):
1111+
# GH 21866 inconsistent hash-function for float64
1112+
# default hash-function would lead to different hash-buckets
1113+
# for 0.0 and -0.0 if there are more than 2^30 hash-buckets
1114+
# but this would mean 16GB
1115+
N = 4 # 12 * 10**8 would trigger the error, if you have enough memory
1116+
m = ht.Float64HashTable(N)
1117+
m.set_item(0.0, 0)
1118+
m.set_item(-0.0, 0)
1119+
assert len(m) == 1 # 0.0 and -0.0 are equivalent
1120+
1121+
def test_add_different_nans(self):
1122+
# GH 21866 inconsistent hash-function for float64
1123+
# create different nans from bit-patterns:
1124+
NAN1 = struct.unpack("d", struct.pack("=Q", 0x7ff8000000000000))[0]
1125+
NAN2 = struct.unpack("d", struct.pack("=Q", 0x7ff8000000000001))[0]
1126+
assert NAN1 != NAN1
1127+
assert NAN2 != NAN2
1128+
# default hash function would lead to different hash-buckets
1129+
# for NAN1 and NAN2 even if there are only 4 buckets:
1130+
m = ht.Float64HashTable()
1131+
m.set_item(NAN1, 0)
1132+
m.set_item(NAN2, 0)
1133+
assert len(m) == 1 # NAN1 and NAN2 are equivalent
1134+
10901135
def test_lookup_overflow(self, writable):
10911136
xs = np.array([1, 2, 2**63], dtype=np.uint64)
10921137
# GH 21688 ensure we can deal with readonly memory views

0 commit comments

Comments
 (0)