@@ -142,7 +142,36 @@ def test_alternate_encoding(self):
142
142
obj = Series (list ('abc' ))
143
143
self .check_equal (obj , encoding = 'ascii' )
144
144
145
# NOTE(review): the leading '-' marks these lines as removed by this diff hunk.
# The removed test built an Index from tm.rands_array(nchars=10000, size=100)
# and passed it to self.check_equal.
- def test_long_strings (self ):
146
-
147
- obj = Index (tm .rands_array (nchars = 10000 , size = 100 ))
148
- self .check_equal (obj )
145
def test_same_len_hash_collisions(self):
    """Check that the two entries of a same-length array hash differently.

    For every exponent k in 8..15 the test asks ``tm.rands_array`` for an
    array built from ``(length, 2)``, hashes it with
    ``hash_array(..., 'utf8')`` and asserts that the first two hash values
    are not equal.  Lengths of the form ``2 ** k + 1`` are exercised
    first, followed by the exact powers of two ``2 ** k``.
    """
    # The outer loop keeps the original ordering (all "+ 1" lengths, then
    # all exact powers of two) so tm.rands_array is called in the same
    # sequence as before.
    for extra in (1, 0):
        for exponent in range(8, 16):
            nchars = 2 ** exponent + extra
            samples = tm.rands_array(nchars, 2)
            hashed = hash_array(samples, 'utf8')
            self.assertFalse(hashed[0] == hashed[1])
158
+
159
def test_hash_collisions(self):
    """Pin the utf8 hashes of two long strings taken from a collision report.

    Each string is hashed on its own (as a one-element object array) and
    compared against its pinned ``uint64`` value; then both strings are
    hashed together and the result must equal the two pinned values
    concatenated in the same order.
    """
    # hash collisions are bad
    # https://github.com/pandas-dev/pandas/issues/14711#issuecomment-264885726
    first = 'Ingrid-9Z9fKIZmkO7i7Cn51Li34pJm44fgX6DYGBNj3VPlOH50m7HnBlPxfIwFMrcNJNMP6PSgLmwWnInciMWrCSAlLEvt7JkJl4IxiMrVbXSa8ZQoVaq5xoQPjltuJEfwdNlO6jo8qRRHvD8sBEBMQASrRa6TsdaPTPCBo3nwIBpE7YzzmyH0vMBhjQZLx1aCT7faSEx7PgFxQhHdKFWROcysamgy9iVj8DO2Fmwg1NNl93rIAqC3mdqfrCxrzfvIY8aJdzin2cHVzy3QUJxZgHvtUtOLxoqnUHsYbNTeq0xcLXpTZEZCxD4PGubIuCNf32c33M7HFsnjWSEjE2yVdWKhmSVodyF8hFYVmhYnMCztQnJrt3O8ZvVRXd5IKwlLexiSp4h888w7SzAIcKgc3g5XQJf6MlSMftDXm9lIsE1mJNiJEv6uY6pgvC3fUPhatlR5JPpVAHNSbSEE73MBzJrhCAbOLXQumyOXigZuPoME7QgJcBalliQol7YZ9'  # noqa
    second = 'Tim-b9MddTxOWW2AT1Py6vtVbZwGAmYCjbp89p8mxsiFoVX4FyDOF3wFiAkyQTUgwg9sVqVYOZo09Dh1AzhFHbgij52ylF0SEwgzjzHH8TGY8Lypart4p4onnDoDvVMBa0kdthVGKl6K0BDVGzyOXPXKpmnMF1H6rJzqHJ0HywfwS4XYpVwlAkoeNsiicHkJUFdUAhG229INzvIAiJuAHeJDUoyO4DCBqtoZ5TDend6TK7Y914yHlfH3g1WZu5LksKv68VQHJriWFYusW5e6ZZ6dKaMjTwEGuRgdT66iU5nqWTHRH8WSzpXoCFwGcTOwyuqPSe0fTe21DVtJn1FKj9F9nEnR9xOvJUO7E0piCIF4Ad9yAIDY4DBimpsTfKXCu1vdHpKYerzbndfuFe5AhfMduLYZJi5iAw8qKSwR5h86ttXV0Mc0QmXz8dsRvDgxjXSmupPxBggdlqUlC828hXiTPD7am0yETBV0F3bEtvPiNJfremszcV8NcqAoARMe'  # noqa
    strings = [first, second]
    pinned_values = [14963968704024874985, 16428432627716348016]

    # these should be different!
    expected_parts = []
    for text, value in zip(strings, pinned_values):
        expected = np.array([value], dtype=np.uint64)
        single = hash_array(np.asarray([text], dtype=object), 'utf8')
        self.assert_numpy_array_equal(single, expected)
        expected_parts.append(expected)

    # Hashing both strings together must give the same per-item values.
    combined = hash_array(np.asarray(strings, dtype=object), 'utf8')
    self.assert_numpy_array_equal(
        combined, np.concatenate(expected_parts, axis=0))
0 commit comments