From 009d000a246bfd8fde7e3be65f70de1bf2a540fb Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Mon, 5 Dec 2016 11:49:25 -0500 Subject: [PATCH 1/3] BUG: we don't like hash collisions in siphash xref #14767 --- pandas/tools/tests/test_hashing.py | 36 ++++++++++++++++++++++++++---- 1 file changed, 32 insertions(+), 4 deletions(-) diff --git a/pandas/tools/tests/test_hashing.py b/pandas/tools/tests/test_hashing.py index 4e05ae7007c80..fe80ced5e1563 100644 --- a/pandas/tools/tests/test_hashing.py +++ b/pandas/tools/tests/test_hashing.py @@ -142,7 +142,35 @@ def test_alternate_encoding(self): obj = Series(list('abc')) self.check_equal(obj, encoding='ascii') - def test_long_strings(self): - - obj = Index(tm.rands_array(nchars=10000, size=100)) - self.check_equal(obj) + def test_same_len_hash_collisions(self): + + for l in range(8): + length = 2**(l + 8) + 1 + s = tm.rands_array(length, 2) + result = hash_array(s, 'utf8') + self.assertFalse(result[0] == result[1]) + + for l in range(8): + length = 2**(l + 8) + s = tm.rands_array(length, 2) + result = hash_array(s, 'utf8') + self.assertFalse(result[0] == result[1]) + + def test_hash_collisions(self): + + # hash collisions are bad + # https://github.com/pandas-dev/pandas/issues/14711#issuecomment-264885726 + L = ['Ingrid-9Z9fKIZmkO7i7Cn51Li34pJm44fgX6DYGBNj3VPlOH50m7HnBlPxfIwFMrcNJNMP6PSgLmwWnInciMWrCSAlLEvt7JkJl4IxiMrVbXSa8ZQoVaq5xoQPjltuJEfwdNlO6jo8qRRHvD8sBEBMQASrRa6TsdaPTPCBo3nwIBpE7YzzmyH0vMBhjQZLx1aCT7faSEx7PgFxQhHdKFWROcysamgy9iVj8DO2Fmwg1NNl93rIAqC3mdqfrCxrzfvIY8aJdzin2cHVzy3QUJxZgHvtUtOLxoqnUHsYbNTeq0xcLXpTZEZCxD4PGubIuCNf32c33M7HFsnjWSEjE2yVdWKhmSVodyF8hFYVmhYnMCztQnJrt3O8ZvVRXd5IKwlLexiSp4h888w7SzAIcKgc3g5XQJf6MlSMftDXm9lIsE1mJNiJEv6uY6pgvC3fUPhatlR5JPpVAHNSbSEE73MBzJrhCAbOLXQumyOXigZuPoME7QgJcBalliQol7YZ9', # noqa + 
'Tim-b9MddTxOWW2AT1Py6vtVbZwGAmYCjbp89p8mxsiFoVX4FyDOF3wFiAkyQTUgwg9sVqVYOZo09Dh1AzhFHbgij52ylF0SEwgzjzHH8TGY8Lypart4p4onnDoDvVMBa0kdthVGKl6K0BDVGzyOXPXKpmnMF1H6rJzqHJ0HywfwS4XYpVwlAkoeNsiicHkJUFdUAhG229INzvIAiJuAHeJDUoyO4DCBqtoZ5TDend6TK7Y914yHlfH3g1WZu5LksKv68VQHJriWFYusW5e6ZZ6dKaMjTwEGuRgdT66iU5nqWTHRH8WSzpXoCFwGcTOwyuqPSe0fTe21DVtJn1FKj9F9nEnR9xOvJUO7E0piCIF4Ad9yAIDY4DBimpsTfKXCu1vdHpKYerzbndfuFe5AhfMduLYZJi5iAw8qKSwR5h86ttXV0Mc0QmXz8dsRvDgxjXSmupPxBggdlqUlC828hXiTPD7am0yETBV0F3bEtvPiNJfremszcV8NcqAoARMe'] # noqa + + # these should be different! + result1 = hash_array(np.asarray(L[0:1], dtype=object), 'utf8') + expected1 = np.array([1760245841805064774], dtype=np.uint64) + self.assert_numpy_array_equal(result1, expected1) + + result2 = hash_array(np.asarray(L[1:2], dtype=object), 'utf8') + expected2 = np.array([1760245841805064774], dtype=np.uint64) + self.assert_numpy_array_equal(result2, expected2) + + result = hash_array(np.asarray(L, dtype=object), 'utf8') + self.assertTrue(len(result)) == 2 From 378cffbbf28055feb61d31777c2c2389350ef741 Mon Sep 17 00:00:00 2001 From: Mike Graham Date: Mon, 5 Dec 2016 19:00:25 -0500 Subject: [PATCH 2/3] This should be a 64-bit int, not an 8-bit int --- pandas/src/hash.pyx | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pandas/src/hash.pyx b/pandas/src/hash.pyx index a393e0df96954..06ed947808e39 100644 --- a/pandas/src/hash.pyx +++ b/pandas/src/hash.pyx @@ -40,7 +40,8 @@ def hash_object_array(ndarray[object] arr, object key, object encoding='utf8'): Py_ssize_t i, l, n ndarray[uint64_t] result bytes data, k - uint8_t *kb, *lens + uint8_t *kb + uint64_t *lens char **vecs, *cdata object val @@ -55,7 +56,7 @@ def hash_object_array(ndarray[object] arr, object key, object encoding='utf8'): # create an array of bytes vecs = <char **> malloc(n * sizeof(char *)) - lens = <uint8_t *> malloc(n * sizeof(uint8_t)) + lens = <uint64_t *> malloc(n * sizeof(uint64_t)) cdef list datas = [] for i in range(n): From 707becfc8bf0d5d8ac615696037b03300d5f65b1 
Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Mon, 5 Dec 2016 20:15:20 -0500 Subject: [PATCH 3/3] fix tests --- pandas/tools/tests/test_hashing.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/pandas/tools/tests/test_hashing.py b/pandas/tools/tests/test_hashing.py index fe80ced5e1563..6e5f30fb7a52d 100644 --- a/pandas/tools/tests/test_hashing.py +++ b/pandas/tools/tests/test_hashing.py @@ -165,12 +165,13 @@ def test_hash_collisions(self): # these should be different! result1 = hash_array(np.asarray(L[0:1], dtype=object), 'utf8') - expected1 = np.array([1760245841805064774], dtype=np.uint64) + expected1 = np.array([14963968704024874985], dtype=np.uint64) self.assert_numpy_array_equal(result1, expected1) result2 = hash_array(np.asarray(L[1:2], dtype=object), 'utf8') - expected2 = np.array([1760245841805064774], dtype=np.uint64) + expected2 = np.array([16428432627716348016], dtype=np.uint64) self.assert_numpy_array_equal(result2, expected2) result = hash_array(np.asarray(L, dtype=object), 'utf8') - self.assertTrue(len(result)) == 2 + self.assert_numpy_array_equal( + result, np.concatenate([expected1, expected2], axis=0))