Skip to content

Commit 009d000

Browse files
committed
BUG: we don't like hash collisions in siphash
xref pandas-dev#14767
1 parent 53bf1b2 commit 009d000

File tree

1 file changed

+32
-4
lines changed

1 file changed

+32
-4
lines changed

pandas/tools/tests/test_hashing.py

+32-4
Original file line numberDiff line numberDiff line change
@@ -142,7 +142,35 @@ def test_alternate_encoding(self):
142142
obj = Series(list('abc'))
143143
self.check_equal(obj, encoding='ascii')
144144

145-
def test_long_strings(self):
    """Run ``check_equal`` on an Index built from long generated strings.

    The strings come from ``tm.rands_array`` with ``nchars=10000`` and
    ``size=100``.
    """
    long_strings = tm.rands_array(nchars=10000, size=100)
    self.check_equal(Index(long_strings))
145+
def test_same_len_hash_collisions(self):
    """Check that two equal-length strings do not hash to the same value.

    For every exponent from 8 through 15, ``tm.rands_array(length, 2)``
    is hashed with ``hash_array`` using the 'utf8' encoding and the two
    resulting hashes are required to differ. Lengths one past each
    power of two are checked first, then the exact powers of two.
    """
    for extra in (1, 0):
        for exponent in range(8, 16):
            length = 2 ** exponent + extra
            pair = tm.rands_array(length, 2)
            hashes = hash_array(pair, 'utf8')
            # assertNotEqual reports both hash values when it fails,
            # and the message identifies the offending length.
            self.assertNotEqual(hashes[0], hashes[1],
                                msg='hash collision at length %d' % length)
158+
159+
def test_hash_collisions(self):
    """Hash two specific long strings, one at a time and then together.

    Each string is hashed alone with ``hash_array`` ('utf8' encoding) and
    compared against a pinned uint64 value; both strings are then hashed
    in a single call and the result must contain two entries.
    """
    # hash collisions are bad
    # https://github.com/pandas-dev/pandas/issues/14711#issuecomment-264885726
    L = ['Ingrid-9Z9fKIZmkO7i7Cn51Li34pJm44fgX6DYGBNj3VPlOH50m7HnBlPxfIwFMrcNJNMP6PSgLmwWnInciMWrCSAlLEvt7JkJl4IxiMrVbXSa8ZQoVaq5xoQPjltuJEfwdNlO6jo8qRRHvD8sBEBMQASrRa6TsdaPTPCBo3nwIBpE7YzzmyH0vMBhjQZLx1aCT7faSEx7PgFxQhHdKFWROcysamgy9iVj8DO2Fmwg1NNl93rIAqC3mdqfrCxrzfvIY8aJdzin2cHVzy3QUJxZgHvtUtOLxoqnUHsYbNTeq0xcLXpTZEZCxD4PGubIuCNf32c33M7HFsnjWSEjE2yVdWKhmSVodyF8hFYVmhYnMCztQnJrt3O8ZvVRXd5IKwlLexiSp4h888w7SzAIcKgc3g5XQJf6MlSMftDXm9lIsE1mJNiJEv6uY6pgvC3fUPhatlR5JPpVAHNSbSEE73MBzJrhCAbOLXQumyOXigZuPoME7QgJcBalliQol7YZ9',  # noqa
         'Tim-b9MddTxOWW2AT1Py6vtVbZwGAmYCjbp89p8mxsiFoVX4FyDOF3wFiAkyQTUgwg9sVqVYOZo09Dh1AzhFHbgij52ylF0SEwgzjzHH8TGY8Lypart4p4onnDoDvVMBa0kdthVGKl6K0BDVGzyOXPXKpmnMF1H6rJzqHJ0HywfwS4XYpVwlAkoeNsiicHkJUFdUAhG229INzvIAiJuAHeJDUoyO4DCBqtoZ5TDend6TK7Y914yHlfH3g1WZu5LksKv68VQHJriWFYusW5e6ZZ6dKaMjTwEGuRgdT66iU5nqWTHRH8WSzpXoCFwGcTOwyuqPSe0fTe21DVtJn1FKj9F9nEnR9xOvJUO7E0piCIF4Ad9yAIDY4DBimpsTfKXCu1vdHpKYerzbndfuFe5AhfMduLYZJi5iAw8qKSwR5h86ttXV0Mc0QmXz8dsRvDgxjXSmupPxBggdlqUlC828hXiTPD7am0yETBV0F3bEtvPiNJfremszcV8NcqAoARMe']  # noqa

    # these should be different!
    # NOTE(review): expected1 and expected2 below are the same number, so
    # the test currently pins a collision rather than distinct hashes --
    # confirm and update the values once the hashing is corrected.
    result1 = hash_array(np.asarray(L[0:1], dtype=object), 'utf8')
    expected1 = np.array([1760245841805064774], dtype=np.uint64)
    self.assert_numpy_array_equal(result1, expected1)

    result2 = hash_array(np.asarray(L[1:2], dtype=object), 'utf8')
    expected2 = np.array([1760245841805064774], dtype=np.uint64)
    self.assert_numpy_array_equal(result2, expected2)

    result = hash_array(np.asarray(L, dtype=object), 'utf8')
    # The comparison has to be inside the assertion: the previous form,
    # ``self.assertTrue(len(result)) == 2``, only checked that the length
    # was non-zero and threw the ``== 2`` result away.
    self.assertEqual(len(result), 2)

0 commit comments

Comments
 (0)