Skip to content

Commit 9a5a5d4

Browse files
committed
ERR: raise on unsupported Python objects in object hashing; only
strings and nulls are supported (xref #14729)
1 parent 423c16a commit 9a5a5d4

File tree

2 files changed

+28
-13
lines changed

2 files changed

+28
-13
lines changed

pandas/src/hash.pyx

+12-2
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ cimport numpy as cnp
77
import numpy as np
88
from numpy cimport ndarray, uint8_t, uint32_t, uint64_t
99

10+
from util cimport _checknull
1011
from cpython cimport (PyString_Check,
1112
PyBytes_Check,
1213
PyUnicode_Check)
@@ -29,6 +30,11 @@ def hash_object_array(ndarray[object] arr, object key, object encoding='utf8'):
2930
-------
3031
1-d uint64 ndarray of hashes
3132
33+
Notes
34+
-----
35+
allowed values must be strings or nulls
36+
mixed array types will raise TypeError
37+
3238
"""
3339
cdef:
3440
Py_ssize_t i, l, n
@@ -60,10 +66,14 @@ def hash_object_array(ndarray[object] arr, object key, object encoding='utf8'):
6066
data = <bytes>val
6167
elif PyUnicode_Check(val):
6268
data = <bytes>val.encode(encoding)
63-
else:
64-
# non-strings
69+
elif _checknull(val):
70+
# null, stringify and encode
6571
data = <bytes>str(val).encode(encoding)
6672

73+
else:
74+
raise TypeError("{} of type {} is not a valid type for "
75+
"hashing, must be string or null".format(val, type(val)))
76+
6777
l = len(data)
6878
lens[i] = l
6979
cdata = data

pandas/tools/tests/test_hashing.py

+16-11
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,7 @@ def test_hash_pandas_object(self):
6363
Series([1.0, 1.5, 3.2], index=[1.5, 1.1, 3.3]),
6464
Series(['a', 'b', 'c']),
6565
Series(['a', np.nan, 'c']),
66+
Series(['a', None, 'c']),
6667
Series([True, False, True]),
6768
Index([1, 2, 3]),
6869
Index([True, False, True]),
@@ -71,9 +72,7 @@ def test_hash_pandas_object(self):
7172
tm.makeMixedDataFrame(),
7273
tm.makeTimeDataFrame(),
7374
tm.makeTimeSeries(),
74-
tm.makeTimedeltaIndex(),
75-
Series([1, 2, 3], index=pd.MultiIndex.from_tuples(
76-
[('a', 1), ('a', 2), ('b', 1)]))]:
75+
tm.makeTimedeltaIndex()]:
7776
self.check_equal(obj)
7877
self.check_not_equal_with_index(obj)
7978

@@ -115,16 +114,22 @@ def f():
115114
hash_pandas_object(Series(list('abc')), hash_key='foo')
116115
self.assertRaises(ValueError, f)
117116

118-
def test_mixed(self):
119-
# mixed objects
117+
def test_unsupported_objects(self):
118+
119+
# mixed objects are not supported
120120
obj = Series(['1', 2, 3])
121-
self.check_equal(obj)
122-
self.check_not_equal_with_index(obj)
123121

124-
# mixed are actually equal when stringified
125-
a = hash_pandas_object(obj)
126-
b = hash_pandas_object(Series(list('123')))
127-
self.assert_series_equal(a, b)
122+
def f():
123+
hash_pandas_object(obj)
124+
self.assertRaises(TypeError, f)
125+
126+
# MultiIndex values are represented as tuples, which are not supported
127+
obj = Series([1, 2, 3], index=pd.MultiIndex.from_tuples(
128+
[('a', 1), ('a', 2), ('b', 1)]))
129+
130+
def f():
131+
hash_pandas_object(obj)
132+
self.assertRaises(TypeError, f)
128133

129134
def test_alread_encoded(self):
130135
# if already encoded then ok

0 commit comments

Comments
 (0)