class TestHashing(object):

-    def setup_method(self, method):
-        self.df = DataFrame(
-            {'i32': np.array([1, 2, 3] * 3, dtype='int32'),
-             'f32': np.array([None, 2.5, 3.5] * 3, dtype='float32'),
-             'cat': Series(['a', 'b', 'c'] * 3).astype('category'),
-             'obj': Series(['d', 'e', 'f'] * 3),
-             'bool': np.array([True, False, True] * 3),
-             'dt': Series(pd.date_range('20130101', periods=9)),
-             'dt_tz': Series(pd.date_range('20130101', periods=9,
-                                           tz='US/Eastern')),
-             'td': Series(pd.timedelta_range('2000', periods=9))})
+    @pytest.fixture(params=[
+        Series([1, 2, 3] * 3, dtype='int32'),
+        Series([None, 2.5, 3.5] * 3, dtype='float32'),
+        Series(['a', 'b', 'c'] * 3, dtype='category'),
+        Series(['d', 'e', 'f'] * 3),
+        Series([True, False, True] * 3),
+        Series(pd.date_range('20130101', periods=9)),
+        Series(pd.date_range('20130101', periods=9, tz='US/Eastern')),
+        Series(pd.timedelta_range('2000', periods=9))])
+    def series(self, request):
+        return request.param

    def test_consistency(self):
        # check that our hash doesn't change because of a mistake
@@ -34,10 +34,9 @@ def test_consistency(self):
                          index=['foo', 'bar', 'baz'])
        tm.assert_series_equal(result, expected)

-    def test_hash_array(self):
-        for name, s in self.df.iteritems():
-            a = s.values
-            tm.assert_numpy_array_equal(hash_array(a), hash_array(a))
+    def test_hash_array(self, series):
+        a = series.values
+        tm.assert_numpy_array_equal(hash_array(a), hash_array(a))

    def test_hash_array_mixed(self):
        result1 = hash_array(np.array([3, 4, 'All']))
@@ -46,10 +45,11 @@ def test_hash_array_mixed(self):
        tm.assert_numpy_array_equal(result1, result2)
        tm.assert_numpy_array_equal(result1, result3)

-    def test_hash_array_errors(self):
-
-        for val in [5, 'foo', pd.Timestamp('20130101')]:
-            pytest.raises(TypeError, hash_array, val)
+    @pytest.mark.parametrize('val', [5, 'foo', pd.Timestamp('20130101')])
+    def test_hash_array_errors(self, val):
+        msg = 'must pass a ndarray-like'
+        with tm.assert_raises_regex(TypeError, msg):
+            hash_array(val)

    def check_equal(self, obj, **kwargs):
        a = hash_pandas_object(obj, **kwargs)
@@ -80,31 +80,33 @@ def test_hash_tuples(self):
        result = hash_tuples(tups[0])
        assert result == expected[0]

-    def test_hash_tuple(self):
+    @pytest.mark.parametrize('tup', [
+        (1, 'one'), (1, np.nan), (1.0, pd.NaT, 'A'),
+        ('A', pd.Timestamp("2012-01-01"))])
+    def test_hash_tuple(self, tup):
        # test equivalence between hash_tuples and hash_tuple
-        for tup in [(1, 'one'), (1, np.nan), (1.0, pd.NaT, 'A'),
-                    ('A', pd.Timestamp("2012-01-01"))]:
-            result = hash_tuple(tup)
-            expected = hash_tuples([tup])[0]
-            assert result == expected
-
-    def test_hash_scalar(self):
-        for val in [1, 1.4, 'A', b'A', u'A', pd.Timestamp("2012-01-01"),
-                    pd.Timestamp("2012-01-01", tz='Europe/Brussels'),
-                    datetime.datetime(2012, 1, 1),
-                    pd.Timestamp("2012-01-01", tz='EST').to_pydatetime(),
-                    pd.Timedelta('1 days'), datetime.timedelta(1),
-                    pd.Period('2012-01-01', freq='D'), pd.Interval(0, 1),
-                    np.nan, pd.NaT, None]:
-            result = _hash_scalar(val)
-            expected = hash_array(np.array([val], dtype=object),
-                                  categorize=True)
-            assert result[0] == expected[0]
-
-    def test_hash_tuples_err(self):
-
-        for val in [5, 'foo', pd.Timestamp('20130101')]:
-            pytest.raises(TypeError, hash_tuples, val)
+        result = hash_tuple(tup)
+        expected = hash_tuples([tup])[0]
+        assert result == expected
+
+    @pytest.mark.parametrize('val', [
+        1, 1.4, 'A', b'A', u'A', pd.Timestamp("2012-01-01"),
+        pd.Timestamp("2012-01-01", tz='Europe/Brussels'),
+        datetime.datetime(2012, 1, 1),
+        pd.Timestamp("2012-01-01", tz='EST').to_pydatetime(),
+        pd.Timedelta('1 days'), datetime.timedelta(1),
+        pd.Period('2012-01-01', freq='D'), pd.Interval(0, 1),
+        np.nan, pd.NaT, None])
+    def test_hash_scalar(self, val):
+        result = _hash_scalar(val)
+        expected = hash_array(np.array([val], dtype=object), categorize=True)
+        assert result[0] == expected[0]
+
+    @pytest.mark.parametrize('val', [5, 'foo', pd.Timestamp('20130101')])
+    def test_hash_tuples_err(self, val):
+        msg = 'must be convertible to a list-of-tuples'
+        with tm.assert_raises_regex(TypeError, msg):
+            hash_tuples(val)

    def test_multiindex_unique(self):
        mi = MultiIndex.from_tuples([(118, 472), (236, 118),
@@ -172,36 +174,35 @@ def test_hash_pandas_object(self, obj):
        self.check_equal(obj)
        self.check_not_equal_with_index(obj)

-    def test_hash_pandas_object2(self):
-        for name, s in self.df.iteritems():
-            self.check_equal(s)
-            self.check_not_equal_with_index(s)
-
-    def test_hash_pandas_empty_object(self):
-        for obj in [Series([], dtype='float64'),
-                    Series([], dtype='object'),
-                    Index([])]:
-            self.check_equal(obj)
+    def test_hash_pandas_object2(self, series):
+        self.check_equal(series)
+        self.check_not_equal_with_index(series)

-        # these are by-definition the same with
-        # or w/o the index as the data is empty
+    @pytest.mark.parametrize('obj', [
+        Series([], dtype='float64'), Series([], dtype='object'), Index([])])
+    def test_hash_pandas_empty_object(self, obj):
+        # these are by-definition the same with
+        # or w/o the index as the data is empty
+        self.check_equal(obj)

-    def test_categorical_consistency(self):
+    @pytest.mark.parametrize('s1', [
+        Series(['a', 'b', 'c', 'd']),
+        Series([1000, 2000, 3000, 4000]),
+        Series(pd.date_range(0, periods=4))])
+    @pytest.mark.parametrize('categorize', [True, False])
+    def test_categorical_consistency(self, s1, categorize):
        # GH15143
        # Check that categoricals hash consistent with their values, not codes
        # This should work for categoricals of any dtype
-        for s1 in [Series(['a', 'b', 'c', 'd']),
-                   Series([1000, 2000, 3000, 4000]),
-                   Series(pd.date_range(0, periods=4))]:
-            s2 = s1.astype('category').cat.set_categories(s1)
-            s3 = s2.cat.set_categories(list(reversed(s1)))
-            for categorize in [True, False]:
-                # These should all hash identically
-                h1 = hash_pandas_object(s1, categorize=categorize)
-                h2 = hash_pandas_object(s2, categorize=categorize)
-                h3 = hash_pandas_object(s3, categorize=categorize)
-                tm.assert_series_equal(h1, h2)
-                tm.assert_series_equal(h1, h3)
+        s2 = s1.astype('category').cat.set_categories(s1)
+        s3 = s2.cat.set_categories(list(reversed(s1)))
+
+        # These should all hash identically
+        h1 = hash_pandas_object(s1, categorize=categorize)
+        h2 = hash_pandas_object(s2, categorize=categorize)
+        h3 = hash_pandas_object(s3, categorize=categorize)
+        tm.assert_series_equal(h1, h2)
+        tm.assert_series_equal(h1, h3)

    def test_categorical_with_nan_consistency(self):
        c = pd.Categorical.from_codes(
@@ -216,13 +217,12 @@ def test_categorical_with_nan_consistency(self):
        assert result[1] in expected

    def test_pandas_errors(self):
-
-        for obj in [pd.Timestamp('20130101')]:
-            with pytest.raises(TypeError):
-                hash_pandas_object(obj)
+        with pytest.raises(TypeError):
+            hash_pandas_object(pd.Timestamp('20130101'))

        with catch_warnings(record=True):
            obj = tm.makePanel()
+
            with pytest.raises(TypeError):
                hash_pandas_object(obj)

@@ -238,9 +238,9 @@ def test_hash_keys(self):

    def test_invalid_key(self):
        # this only matters for object dtypes
-        def f():
+        msg = 'key should be a 16-byte string encoded'
+        with tm.assert_raises_regex(ValueError, msg):
            hash_pandas_object(Series(list('abc')), hash_key='foo')
-        pytest.raises(ValueError, f)

    def test_alread_encoded(self):
        # if already encoded then ok
@@ -253,19 +253,13 @@ def test_alternate_encoding(self):
        obj = Series(list('abc'))
        self.check_equal(obj, encoding='ascii')

-    def test_same_len_hash_collisions(self):
-
-        for l in range(8):
-            length = 2 ** (l + 8) + 1
-            s = tm.rands_array(length, 2)
-            result = hash_array(s, 'utf8')
-            assert not result[0] == result[1]
-
-        for l in range(8):
-            length = 2 ** (l + 8)
-            s = tm.rands_array(length, 2)
-            result = hash_array(s, 'utf8')
-            assert not result[0] == result[1]
+    @pytest.mark.parametrize('l_exp', range(8))
+    @pytest.mark.parametrize('l_add', [0, 1])
+    def test_same_len_hash_collisions(self, l_exp, l_add):
+        length = 2 ** (l_exp + 8) + l_add
+        s = tm.rands_array(length, 2)
+        result = hash_array(s, 'utf8')
+        assert not result[0] == result[1]

    def test_hash_collisions(self):
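Not part of the commit: a minimal, self-contained sketch of the @pytest.fixture(params=...) pattern that the new series fixture above relies on. The fixture name value and the test name test_roundtrip are illustrative only; it assumes nothing beyond pytest being installed.

import pytest


@pytest.fixture(params=[1, 2.5, 'a'])
def value(request):
    # pytest instantiates this fixture once per entry in `params`;
    # every test that requests `value` runs once per parametrized value.
    return request.param


def test_roundtrip(value):
    # trivially true for each of the three parametrized values
    assert value == value

Running pytest -v shows one test id per parameter; that per-case collection and reporting is the usual motivation for replacing explicit for loops (as in the removed tests above) with fixtures and @pytest.mark.parametrize.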