diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt index 8fe3023e9537c..b723e9cc6dca8 100644 --- a/doc/source/whatsnew/v0.24.0.txt +++ b/doc/source/whatsnew/v0.24.0.txt @@ -471,6 +471,7 @@ Numeric - Bug in :class:`Series` ``__rmatmul__`` doesn't support matrix vector multiplication (:issue:`21530`) - Bug in :func:`factorize` fails with read-only array (:issue:`12813`) +- Bug in :func:`unique` where signed zeros were handled inconsistently: for some inputs 0.0 and -0.0 were treated as equal and for other inputs as different. Now they are treated as equal for all inputs (:issue:`21866`) - - diff --git a/pandas/_libs/src/klib/khash_python.h b/pandas/_libs/src/klib/khash_python.h index dd75ae5ec7e28..e9fb49e8a5e42 100644 --- a/pandas/_libs/src/klib/khash_python.h +++ b/pandas/_libs/src/klib/khash_python.h @@ -19,7 +19,20 @@ khint64_t PANDAS_INLINE asint64(double key) { memcpy(&val, &key, sizeof(double)); return val; } -#define kh_float64_hash_func(key) (khint32_t)((asint64(key))>>33^(asint64(key))^(asint64(key))<<11) + +// correct for all inputs but not -0.0 and NaNs +#define kh_float64_hash_func_0_NAN(key) (khint32_t)((asint64(key))>>33^(asint64(key))^(asint64(key))<<11) + +// correct for all inputs but not NaNs +#define kh_float64_hash_func_NAN(key) ((key) == 0.0 ? \ + kh_float64_hash_func_0_NAN(0.0) : \ + kh_float64_hash_func_0_NAN(key)) + +// correct for all +#define kh_float64_hash_func(key) ((key) != (key) ? 
\ + kh_float64_hash_func_NAN(Py_NAN) : \ + kh_float64_hash_func_NAN(key)) + #define kh_float64_hash_equal(a, b) ((a) == (b) || ((b) != (b) && (a) != (a))) #define KHASH_MAP_INIT_FLOAT64(name, khval_t) \ diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index 25e64aa82cc36..3e754355bcb26 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -7,6 +7,7 @@ from numpy import nan from datetime import datetime from itertools import permutations +import struct from pandas import (Series, Categorical, CategoricalIndex, Timestamp, DatetimeIndex, Index, IntervalIndex) import pandas as pd @@ -500,6 +501,25 @@ def test_obj_none_preservation(self): tm.assert_numpy_array_equal(result, expected, strict_nan=True) + def test_signed_zero(self): + # GH 21866 + a = np.array([-0.0, 0.0]) + result = pd.unique(a) + expected = np.array([-0.0]) # 0.0 and -0.0 are equivalent + tm.assert_numpy_array_equal(result, expected) + + def test_different_nans(self): + # GH 21866 + # create different nans from bit-patterns: + NAN1 = struct.unpack("d", struct.pack("=Q", 0x7ff8000000000000))[0] + NAN2 = struct.unpack("d", struct.pack("=Q", 0x7ff8000000000001))[0] + assert NAN1 != NAN1 + assert NAN2 != NAN2 + a = np.array([NAN1, NAN2]) # NAN1 and NAN2 are equivalent + result = pd.unique(a) + expected = np.array([np.nan]) + tm.assert_numpy_array_equal(result, expected) + class TestIsin(object): @@ -1087,6 +1107,31 @@ def test_lookup_nan(self, writable): tm.assert_numpy_array_equal(m.lookup(xs), np.arange(len(xs), dtype=np.int64)) + def test_add_signed_zeros(self): + # GH 21866 inconsistent hash-function for float64 + # default hash-function would lead to different hash-buckets + # for 0.0 and -0.0 if there are more than 2^30 hash-buckets + # but this would mean 16GB + N = 4 # 12 * 10**8 would trigger the error, if you have enough memory + m = ht.Float64HashTable(N) + m.set_item(0.0, 0) + m.set_item(-0.0, 0) + assert len(m) == 1 # 0.0 and -0.0 are equivalent + + def 
test_add_different_nans(self): + # GH 21866 inconsistent hash-function for float64 + # create different nans from bit-patterns: + NAN1 = struct.unpack("d", struct.pack("=Q", 0x7ff8000000000000))[0] + NAN2 = struct.unpack("d", struct.pack("=Q", 0x7ff8000000000001))[0] + assert NAN1 != NAN1 + assert NAN2 != NAN2 + # default hash function would lead to different hash-buckets + # for NAN1 and NAN2 even if there are only 4 buckets: + m = ht.Float64HashTable() + m.set_item(NAN1, 0) + m.set_item(NAN2, 0) + assert len(m) == 1 # NAN1 and NAN2 are equivalent + def test_lookup_overflow(self, writable): xs = np.array([1, 2, 2**63], dtype=np.uint64) # GH 21688 ensure we can deal with readonly memory views