BUG: issues with hash-function for Float64HashTable (GH21866)

realead · realead · commit 0d7fe27c1b75 · 2018-07-13T23:46:45.000+02:00
The following issues

   1)  hash(0.0) != hash(-0.0)
   2)  hash(x) != hash(y) for NaNs x,y which have different bit patterns

are solved by setting:

   1) hash(-0.0):=hash(0.0)
   2) hash(x):=hash(np.nan) for every x which is nan
diff --git a/pandas/_libs/src/klib/khash_python.h b/pandas/_libs/src/klib/khash_python.h
@@ -19,7 +19,18 @@ khint64_t PANDAS_INLINE asint64(double key) {
   memcpy(&val, &key, sizeof(double));
   return val;
 }
-#define kh_float64_hash_func(key) (khint32_t)((asint64(key))>>33^(asint64(key))^(asint64(key))<<11)
+
+// hashes the raw bits of key: 0.0/-0.0 and distinct NaNs get different hashes
+#define kh_float64_hash_func_0_NAN(key) (khint32_t)((asint64(key))>>33^(asint64(key))^(asint64(key))<<11)
+// hashes -0.0 like 0.0 (both compare == 0.0); NaNs are still not unified
+#define kh_float64_hash_func_NAN(key) ((key) == 0.0 ?                       \
+                                        kh_float64_hash_func_0_NAN(0.0) : \
+                                        kh_float64_hash_func_0_NAN(key))
+// hashes every NaN (key != key) like NAN, so equal keys always hash equally
+#define kh_float64_hash_func(key) ((key) != (key) ?                       \
+                                   kh_float64_hash_func_NAN(NAN) :        \
+                                   kh_float64_hash_func_NAN(key))
+
 #define kh_float64_hash_equal(a, b) ((a) == (b) || ((b) != (b) && (a) != (a)))
 
 #define KHASH_MAP_INIT_FLOAT64(name, khval_t)								\
diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py
@@ -7,6 +7,7 @@
 from numpy import nan
 from datetime import datetime
 from itertools import permutations
+import struct
 from pandas import (Series, Categorical, CategoricalIndex,
                     Timestamp, DatetimeIndex, Index, IntervalIndex)
 import pandas as pd
@@ -1087,6 +1088,39 @@ def test_lookup_nan(self, writable):
         tm.assert_numpy_array_equal(m.lookup(xs), np.arange(len(xs),
                                                             dtype=np.int64))
 
+    def test_add_signed_zeros(self):
+        # GH 21866: hashing the raw bits puts 0.0 and -0.0 into different
+        # buckets once there are more than 2^30 buckets, so ask for a huge
+        # table; the memory is not touched, so the system doesn't commit 16GB
+        m = ht.Float64HashTable(12 * 10**8)
+        # the table is now big enough that, without the fix, 0.0 and -0.0
+        # might end up in different buckets:
+        m.set_item(0.0, 0)
+        m.set_item(-0.0, 0)
+        assert len(m) == 1  # 0.0 and -0.0 are equivalent
+
+    def test_add_signed_nans(self):
+        # GH 21866: hashing the raw bits puts NAN and -NAN into different
+        # buckets once there are more than 2^30 buckets, so ask for a huge
+        # table; the memory is not touched, so the system doesn't commit 16GB
+        m = ht.Float64HashTable(12 * 10**8)
+        # the table is now big enough that, without the fix, np.nan and
+        # -np.nan might end up in different buckets:
+        m.set_item(np.nan, 0)
+        m.set_item(-np.nan, 0)
+        assert len(m) == 1  # nan and -nan are equivalent
+
+    def test_add_different_nans(self):
+        # create different nans from bit-patterns:
+        NAN1 = struct.unpack("d", struct.pack("L", 0x7ff8000000000000))[0]
+        NAN2 = struct.unpack("d", struct.pack("L", 0x7ff8000000000001))[0]
+        # default hash function would lead to different hash-buckets
+        # for NAN1 and NAN2 even if there are only 4 buckets:
+        m = ht.Float64HashTable()
+        m.set_item(NAN1, 0)
+        m.set_item(NAN2, 0)
+        assert len(m) == 1  # NAN1 and NAN2 are equivalent
+
     def test_lookup_overflow(self, writable):
         xs = np.array([1, 2, 2**63], dtype=np.uint64)
         # GH 21688 ensure we can deal with readonly memory views