diff --git a/pandas/src/hash.pyx b/pandas/src/hash.pyx index b8c309f1f7a13..6c0c7804edd05 100644 --- a/pandas/src/hash.pyx +++ b/pandas/src/hash.pyx @@ -7,6 +7,7 @@ cimport numpy as cnp import numpy as np from numpy cimport ndarray, uint8_t, uint32_t, uint64_t +from util cimport _checknull from cpython cimport (PyString_Check, PyBytes_Check, PyUnicode_Check) @@ -29,6 +30,11 @@ def hash_object_array(ndarray[object] arr, object key, object encoding='utf8'): ------- 1-d uint64 ndarray of hashes + Notes + ----- + allowed values must be strings, or nulls + mixed array types will raise TypeError + """ cdef: Py_ssize_t i, l, n @@ -60,10 +66,14 @@ def hash_object_array(ndarray[object] arr, object key, object encoding='utf8'): data = val elif PyUnicode_Check(val): data = val.encode(encoding) - else: - # non-strings + elif _checknull(val): + # null, stringify and encode data = str(val).encode(encoding) + else: + raise TypeError("{} of type {} is not a valid type for " + "hashing, must be string or null".format(val, type(val))) + l = len(data) lens[i] = l cdata = data diff --git a/pandas/tools/tests/test_hashing.py b/pandas/tools/tests/test_hashing.py index 3e4c77244d2f7..4e05ae7007c80 100644 --- a/pandas/tools/tests/test_hashing.py +++ b/pandas/tools/tests/test_hashing.py @@ -63,6 +63,7 @@ def test_hash_pandas_object(self): Series([1.0, 1.5, 3.2], index=[1.5, 1.1, 3.3]), Series(['a', 'b', 'c']), Series(['a', np.nan, 'c']), + Series(['a', None, 'c']), Series([True, False, True]), Index([1, 2, 3]), Index([True, False, True]), @@ -71,9 +72,7 @@ def test_hash_pandas_object(self): tm.makeMixedDataFrame(), tm.makeTimeDataFrame(), tm.makeTimeSeries(), - tm.makeTimedeltaIndex(), - Series([1, 2, 3], index=pd.MultiIndex.from_tuples( - [('a', 1), ('a', 2), ('b', 1)]))]: + tm.makeTimedeltaIndex()]: self.check_equal(obj) self.check_not_equal_with_index(obj) @@ -115,16 +114,22 @@ def f(): hash_pandas_object(Series(list('abc')), hash_key='foo') self.assertRaises(ValueError, f) - def test_mixed(self): - # mixed objects + def test_unsupported_objects(self): + + # mixed objects are not supported obj = Series(['1', 2, 3]) - self.check_equal(obj) - self.check_not_equal_with_index(obj) - # mixed are actually equal when stringified - a = hash_pandas_object(obj) - b = hash_pandas_object(Series(list('123'))) - self.assert_series_equal(a, b) + def f(): + hash_pandas_object(obj) + self.assertRaises(TypeError, f) + + # MultiIndex are represented as tuples + obj = Series([1, 2, 3], index=pd.MultiIndex.from_tuples( + [('a', 1), ('a', 2), ('b', 1)])) + + def f(): + hash_pandas_object(obj) + self.assertRaises(TypeError, f) def test_alread_encoded(self): # if already encoded then ok