From 77f7ac87b9b71f132d9a36a6ecd789698f446db7 Mon Sep 17 00:00:00 2001 From: gfyoung Date: Thu, 29 Nov 2018 22:04:16 -0800 Subject: [PATCH 1/2] REF: Convert test_hashing to pytest idiom --- pandas/tests/util/test_hashing.py | 586 ++++++++++++++++-------------- 1 file changed, 317 insertions(+), 269 deletions(-) diff --git a/pandas/tests/util/test_hashing.py b/pandas/tests/util/test_hashing.py index 9f5b4f7b90d9f..53aeebc53e338 100644 --- a/pandas/tests/util/test_hashing.py +++ b/pandas/tests/util/test_hashing.py @@ -10,272 +10,320 @@ import pandas.util.testing as tm -class TestHashing(object): - - @pytest.fixture(params=[ - Series([1, 2, 3] * 3, dtype='int32'), - Series([None, 2.5, 3.5] * 3, dtype='float32'), - Series(['a', 'b', 'c'] * 3, dtype='category'), - Series(['d', 'e', 'f'] * 3), - Series([True, False, True] * 3), - Series(pd.date_range('20130101', periods=9)), - Series(pd.date_range('20130101', periods=9, tz='US/Eastern')), - Series(pd.timedelta_range('2000', periods=9))]) - def series(self, request): - return request.param - - def test_consistency(self): - # check that our hash doesn't change because of a mistake - # in the actual code; this is the ground truth - result = hash_pandas_object(Index(['foo', 'bar', 'baz'])) - expected = Series(np.array([3600424527151052760, 1374399572096150070, - 477881037637427054], dtype='uint64'), - index=['foo', 'bar', 'baz']) - tm.assert_series_equal(result, expected) - - def test_hash_array(self, series): - a = series.values - tm.assert_numpy_array_equal(hash_array(a), hash_array(a)) - - def test_hash_array_mixed(self): - result1 = hash_array(np.array([3, 4, 'All'])) - result2 = hash_array(np.array(['3', '4', 'All'])) - result3 = hash_array(np.array([3, 4, 'All'], dtype=object)) - tm.assert_numpy_array_equal(result1, result2) - tm.assert_numpy_array_equal(result1, result3) - - @pytest.mark.parametrize('val', [5, 'foo', pd.Timestamp('20130101')]) - def test_hash_array_errors(self, val): - msg = 'must pass a ndarray-like' - with pytest.raises(TypeError, match=msg): - hash_array(val) - - def check_equal(self, obj, **kwargs): - a = hash_pandas_object(obj, **kwargs) - b = hash_pandas_object(obj, **kwargs) - tm.assert_series_equal(a, b) - - kwargs.pop('index', None) - a = hash_pandas_object(obj, **kwargs) - b = hash_pandas_object(obj, **kwargs) - tm.assert_series_equal(a, b) - - def check_not_equal_with_index(self, obj): - - # check that we are not hashing the same if - # we include the index - if not isinstance(obj, Index): - a = hash_pandas_object(obj, index=True) - b = hash_pandas_object(obj, index=False) - if len(obj): - assert not (a == b).all() - - def test_hash_tuples(self): - tups = [(1, 'one'), (1, 'two'), (2, 'one')] - result = hash_tuples(tups) - expected = hash_pandas_object(MultiIndex.from_tuples(tups)).values - tm.assert_numpy_array_equal(result, expected) - - result = hash_tuples(tups[0]) - assert result == expected[0] - - @pytest.mark.parametrize('tup', [ - (1, 'one'), (1, np.nan), (1.0, pd.NaT, 'A'), - ('A', pd.Timestamp("2012-01-01"))]) - def test_hash_tuple(self, tup): - # test equivalence between hash_tuples and hash_tuple - result = hash_tuple(tup) - expected = hash_tuples([tup])[0] - assert result == expected - - @pytest.mark.parametrize('val', [ - 1, 1.4, 'A', b'A', u'A', pd.Timestamp("2012-01-01"), - pd.Timestamp("2012-01-01", tz='Europe/Brussels'), - datetime.datetime(2012, 1, 1), - pd.Timestamp("2012-01-01", tz='EST').to_pydatetime(), - pd.Timedelta('1 days'), datetime.timedelta(1), - pd.Period('2012-01-01', freq='D'), pd.Interval(0, 1), - np.nan, pd.NaT, None]) - def test_hash_scalar(self, val): - result = _hash_scalar(val) - expected = hash_array(np.array([val], dtype=object), categorize=True) - assert result[0] == expected[0] - - @pytest.mark.parametrize('val', [5, 'foo', pd.Timestamp('20130101')]) - def test_hash_tuples_err(self, val): - msg = 'must be convertible to a list-of-tuples' - with pytest.raises(TypeError, match=msg): - hash_tuples(val) - - def test_multiindex_unique(self): - mi = MultiIndex.from_tuples([(118, 472), (236, 118), - (51, 204), (102, 51)]) - assert mi.is_unique is True - result = hash_pandas_object(mi) - assert result.is_unique is True - - def test_multiindex_objects(self): - mi = MultiIndex(levels=[['b', 'd', 'a'], [1, 2, 3]], - labels=[[0, 1, 0, 2], [2, 0, 0, 1]], - names=['col1', 'col2']) - recons = mi._sort_levels_monotonic() - - # these are equal - assert mi.equals(recons) - assert Index(mi.values).equals(Index(recons.values)) - - # _hashed_values and hash_pandas_object(..., index=False) - # equivalency - expected = hash_pandas_object( - mi, index=False).values - result = mi._hashed_values - tm.assert_numpy_array_equal(result, expected) - - expected = hash_pandas_object( - recons, index=False).values - result = recons._hashed_values - tm.assert_numpy_array_equal(result, expected) - - expected = mi._hashed_values - result = recons._hashed_values - - # values should match, but in different order - tm.assert_numpy_array_equal(np.sort(result), - np.sort(expected)) - - @pytest.mark.parametrize('obj', [ - Series([1, 2, 3]), - Series([1.0, 1.5, 3.2]), - Series([1.0, 1.5, np.nan]), - Series([1.0, 1.5, 3.2], index=[1.5, 1.1, 3.3]), - Series(['a', 'b', 'c']), - Series(['a', np.nan, 'c']), - Series(['a', None, 'c']), - Series([True, False, True]), - Series(), - Index([1, 2, 3]), - Index([True, False, True]), - DataFrame({'x': ['a', 'b', 'c'], 'y': [1, 2, 3]}), - DataFrame(), - tm.makeMissingDataframe(), - tm.makeMixedDataFrame(), - tm.makeTimeDataFrame(), - tm.makeTimeSeries(), - tm.makeTimedeltaIndex(), - tm.makePeriodIndex(), - Series(tm.makePeriodIndex()), - Series(pd.date_range('20130101', periods=3, tz='US/Eastern')), - MultiIndex.from_product([range(5), ['foo', 'bar', 'baz'], - pd.date_range('20130101', periods=2)]), - MultiIndex.from_product([pd.CategoricalIndex(list('aabc')), range(3)]) - ]) - def test_hash_pandas_object(self, obj): - self.check_equal(obj) - self.check_not_equal_with_index(obj) - - def test_hash_pandas_object2(self, series): - self.check_equal(series) - self.check_not_equal_with_index(series) - - @pytest.mark.parametrize('obj', [ - Series([], dtype='float64'), Series([], dtype='object'), Index([])]) - def test_hash_pandas_empty_object(self, obj): - # these are by-definition the same with - # or w/o the index as the data is empty - self.check_equal(obj) - - @pytest.mark.parametrize('s1', [ - Series(['a', 'b', 'c', 'd']), - Series([1000, 2000, 3000, 4000]), - Series(pd.date_range(0, periods=4))]) - @pytest.mark.parametrize('categorize', [True, False]) - def test_categorical_consistency(self, s1, categorize): - # GH15143 - # Check that categoricals hash consistent with their values, not codes - # This should work for categoricals of any dtype - s2 = s1.astype('category').cat.set_categories(s1) - s3 = s2.cat.set_categories(list(reversed(s1))) - - # These should all hash identically - h1 = hash_pandas_object(s1, categorize=categorize) - h2 = hash_pandas_object(s2, categorize=categorize) - h3 = hash_pandas_object(s3, categorize=categorize) - tm.assert_series_equal(h1, h2) - tm.assert_series_equal(h1, h3) - - def test_categorical_with_nan_consistency(self): - c = pd.Categorical.from_codes( - [-1, 0, 1, 2, 3, 4], - categories=pd.date_range('2012-01-01', periods=5, name='B')) - expected = hash_array(c, categorize=False) - c = pd.Categorical.from_codes( - [-1, 0], - categories=[pd.Timestamp('2012-01-01')]) - result = hash_array(c, categorize=False) - assert result[0] in expected - assert result[1] in expected - - @pytest.mark.filterwarnings("ignore:\\nPanel:FutureWarning") - def test_pandas_errors(self): - with pytest.raises(TypeError): - hash_pandas_object(pd.Timestamp('20130101')) - - obj = tm.makePanel() - - with pytest.raises(TypeError): - hash_pandas_object(obj) - - def test_hash_keys(self): - # using different hash keys, should have different hashes - # for the same data - - # this only matters for object dtypes - obj = Series(list('abc')) - a = hash_pandas_object(obj, hash_key='9876543210123456') - b = hash_pandas_object(obj, hash_key='9876543210123465') - assert (a != b).all() - - def test_invalid_key(self): - # this only matters for object dtypes - msg = 'key should be a 16-byte string encoded' - with pytest.raises(ValueError, match=msg): - hash_pandas_object(Series(list('abc')), hash_key='foo') - - def test_alread_encoded(self): - # if already encoded then ok - - obj = Series(list('abc')).str.encode('utf8') - self.check_equal(obj) - - def test_alternate_encoding(self): - - obj = Series(list('abc')) - self.check_equal(obj, encoding='ascii') - - @pytest.mark.parametrize('l_exp', range(8)) - @pytest.mark.parametrize('l_add', [0, 1]) - def test_same_len_hash_collisions(self, l_exp, l_add): - length = 2**(l_exp + 8) + l_add - s = tm.rands_array(length, 2) - result = hash_array(s, 'utf8') - assert not result[0] == result[1] - - def test_hash_collisions(self): - - # hash collisions are bad - # https://github.com/pandas-dev/pandas/issues/14711#issuecomment-264885726 - L = ['Ingrid-9Z9fKIZmkO7i7Cn51Li34pJm44fgX6DYGBNj3VPlOH50m7HnBlPxfIwFMrcNJNMP6PSgLmwWnInciMWrCSAlLEvt7JkJl4IxiMrVbXSa8ZQoVaq5xoQPjltuJEfwdNlO6jo8qRRHvD8sBEBMQASrRa6TsdaPTPCBo3nwIBpE7YzzmyH0vMBhjQZLx1aCT7faSEx7PgFxQhHdKFWROcysamgy9iVj8DO2Fmwg1NNl93rIAqC3mdqfrCxrzfvIY8aJdzin2cHVzy3QUJxZgHvtUtOLxoqnUHsYbNTeq0xcLXpTZEZCxD4PGubIuCNf32c33M7HFsnjWSEjE2yVdWKhmSVodyF8hFYVmhYnMCztQnJrt3O8ZvVRXd5IKwlLexiSp4h888w7SzAIcKgc3g5XQJf6MlSMftDXm9lIsE1mJNiJEv6uY6pgvC3fUPhatlR5JPpVAHNSbSEE73MBzJrhCAbOLXQumyOXigZuPoME7QgJcBalliQol7YZ9', # noqa - 'Tim-b9MddTxOWW2AT1Py6vtVbZwGAmYCjbp89p8mxsiFoVX4FyDOF3wFiAkyQTUgwg9sVqVYOZo09Dh1AzhFHbgij52ylF0SEwgzjzHH8TGY8Lypart4p4onnDoDvVMBa0kdthVGKl6K0BDVGzyOXPXKpmnMF1H6rJzqHJ0HywfwS4XYpVwlAkoeNsiicHkJUFdUAhG229INzvIAiJuAHeJDUoyO4DCBqtoZ5TDend6TK7Y914yHlfH3g1WZu5LksKv68VQHJriWFYusW5e6ZZ6dKaMjTwEGuRgdT66iU5nqWTHRH8WSzpXoCFwGcTOwyuqPSe0fTe21DVtJn1FKj9F9nEnR9xOvJUO7E0piCIF4Ad9yAIDY4DBimpsTfKXCu1vdHpKYerzbndfuFe5AhfMduLYZJi5iAw8qKSwR5h86ttXV0Mc0QmXz8dsRvDgxjXSmupPxBggdlqUlC828hXiTPD7am0yETBV0F3bEtvPiNJfremszcV8NcqAoARMe'] # noqa - - # these should be different! - result1 = hash_array(np.asarray(L[0:1], dtype=object), 'utf8') - expected1 = np.array([14963968704024874985], dtype=np.uint64) - tm.assert_numpy_array_equal(result1, expected1) - - result2 = hash_array(np.asarray(L[1:2], dtype=object), 'utf8') - expected2 = np.array([16428432627716348016], dtype=np.uint64) - tm.assert_numpy_array_equal(result2, expected2) - - result = hash_array(np.asarray(L, dtype=object), 'utf8') - tm.assert_numpy_array_equal( - result, np.concatenate([expected1, expected2], axis=0)) +@pytest.fixture(params=[ + Series([1, 2, 3] * 3, dtype="int32"), + Series([None, 2.5, 3.5] * 3, dtype="float32"), + Series(["a", "b", "c"] * 3, dtype="category"), + Series(["d", "e", "f"] * 3), + Series([True, False, True] * 3), + Series(pd.date_range("20130101", periods=9)), + Series(pd.date_range("20130101", periods=9, tz="US/Eastern")), + Series(pd.timedelta_range("2000", periods=9))]) +def series(request): + return request.param + + +def _check_equal(obj, **kwargs): + """ + Check that hashing an objects produces the same value each time. + + Parameters + ---------- + obj : object + The object to hash. + kwargs : kwargs + Keyword arguments to pass to the hashing function. + """ + a = hash_pandas_object(obj, **kwargs) + b = hash_pandas_object(obj, **kwargs) + tm.assert_series_equal(a, b) + + kwargs.pop("index", None) + + a = hash_pandas_object(obj, **kwargs) + b = hash_pandas_object(obj, **kwargs) + tm.assert_series_equal(a, b) + + +def _check_not_equal_with_index(obj): + """ + Check the hash of an object with and without its index is not the same. + + Parameters + ---------- + obj : object + The object to hash. + """ + if not isinstance(obj, Index): + a = hash_pandas_object(obj, index=True) + b = hash_pandas_object(obj, index=False) + + if len(obj): + assert not (a == b).all() + + +def test_consistency(): + # Check that our hash doesn't change because of a mistake + # in the actual code; this is the ground truth. + result = hash_pandas_object(Index(["foo", "bar", "baz"])) + expected = Series(np.array([3600424527151052760, 1374399572096150070, + 477881037637427054], dtype="uint64"), + index=["foo", "bar", "baz"]) + tm.assert_series_equal(result, expected) + + +def test_hash_array(series): + arr = series.values + tm.assert_numpy_array_equal(hash_array(arr), hash_array(arr)) + + +@pytest.mark.parametrize("arr2", [ + np.array([3, 4, "All"]), + np.array([3, 4, "All"], dtype=object), +]) +def test_hash_array_mixed(arr2): + result1 = hash_array(np.array(["3", "4", "All"])) + result2 = hash_array(arr2) + + tm.assert_numpy_array_equal(result1, result2) + + +@pytest.mark.parametrize("val", [5, "foo", pd.Timestamp("20130101")]) +def test_hash_array_errors(val): + msg = "must pass a ndarray-like" + with pytest.raises(TypeError, match=msg): + hash_array(val) + + +def test_hash_tuples(): + tuples = [(1, "one"), (1, "two"), (2, "one")] + result = hash_tuples(tuples) + + expected = hash_pandas_object(MultiIndex.from_tuples(tuples)).values + tm.assert_numpy_array_equal(result, expected) + + result = hash_tuples(tuples[0]) + assert result == expected[0] + + +@pytest.mark.parametrize("tup", [ + (1, "one"), (1, np.nan), (1.0, pd.NaT, "A"), + ("A", pd.Timestamp("2012-01-01"))]) +def test_hash_tuple(tup): + # Test equivalence between + # hash_tuples and hash_tuple. + result = hash_tuple(tup) + expected = hash_tuples([tup])[0] + + assert result == expected + + +@pytest.mark.parametrize("val", [ + 1, 1.4, "A", b"A", u"A", pd.Timestamp("2012-01-01"), + pd.Timestamp("2012-01-01", tz="Europe/Brussels"), + datetime.datetime(2012, 1, 1), + pd.Timestamp("2012-01-01", tz="EST").to_pydatetime(), + pd.Timedelta("1 days"), datetime.timedelta(1), + pd.Period("2012-01-01", freq="D"), pd.Interval(0, 1), + np.nan, pd.NaT, None]) +def test_hash_scalar(val): + result = _hash_scalar(val) + expected = hash_array(np.array([val], dtype=object), categorize=True) + + assert result[0] == expected[0] + + +@pytest.mark.parametrize("val", [5, "foo", pd.Timestamp("20130101")]) +def test_hash_tuples_err(val): + msg = "must be convertible to a list-of-tuples" + with pytest.raises(TypeError, match=msg): + hash_tuples(val) + + +def test_multiindex_unique(): + mi = MultiIndex.from_tuples([(118, 472), (236, 118), + (51, 204), (102, 51)]) + assert mi.is_unique is True + + result = hash_pandas_object(mi) + assert result.is_unique is True + + +def test_multiindex_objects(): + mi = MultiIndex(levels=[["b", "d", "a"], [1, 2, 3]], + labels=[[0, 1, 0, 2], [2, 0, 0, 1]], + names=["col1", "col2"]) + recons = mi._sort_levels_monotonic() + + # These are equal. + assert mi.equals(recons) + assert Index(mi.values).equals(Index(recons.values)) + + # _hashed_values and hash_pandas_object(..., index=False) equivalency. + expected = hash_pandas_object(mi, index=False).values + result = mi._hashed_values + + tm.assert_numpy_array_equal(result, expected) + + expected = hash_pandas_object(recons, index=False).values + result = recons._hashed_values + + tm.assert_numpy_array_equal(result, expected) + + expected = mi._hashed_values + result = recons._hashed_values + + # Values should match, but in different order. + tm.assert_numpy_array_equal(np.sort(result), np.sort(expected)) + + +@pytest.mark.parametrize("obj", [ + Series([1, 2, 3]), + Series([1.0, 1.5, 3.2]), + Series([1.0, 1.5, np.nan]), + Series([1.0, 1.5, 3.2], index=[1.5, 1.1, 3.3]), + Series(["a", "b", "c"]), + Series(["a", np.nan, "c"]), + Series(["a", None, "c"]), + Series([True, False, True]), + Series(), + Index([1, 2, 3]), + Index([True, False, True]), + DataFrame({"x": ["a", "b", "c"], "y": [1, 2, 3]}), + DataFrame(), + tm.makeMissingDataframe(), + tm.makeMixedDataFrame(), + tm.makeTimeDataFrame(), + tm.makeTimeSeries(), + tm.makeTimedeltaIndex(), + tm.makePeriodIndex(), + Series(tm.makePeriodIndex()), + Series(pd.date_range("20130101", periods=3, tz="US/Eastern")), + MultiIndex.from_product([range(5), ["foo", "bar", "baz"], + pd.date_range("20130101", periods=2)]), + MultiIndex.from_product([pd.CategoricalIndex(list("aabc")), range(3)]) +]) +def test_hash_pandas_object(obj): + _check_equal(obj) + _check_not_equal_with_index(obj) + + +def test_hash_pandas_object2(series): + _check_equal(series) + _check_not_equal_with_index(series) + + +@pytest.mark.parametrize("obj", [ + Series([], dtype="float64"), Series([], dtype="object"), Index([])]) +def test_hash_pandas_empty_object(obj): + # These are by-definition the same with + # or without the index as the data is empty. + _check_equal(obj) + + +@pytest.mark.parametrize("s1", [ + Series(["a", "b", "c", "d"]), + Series([1000, 2000, 3000, 4000]), + Series(pd.date_range(0, periods=4))]) +@pytest.mark.parametrize("categorize", [True, False]) +def test_categorical_consistency(s1, categorize): + # see gh-15143 + # + # Check that categoricals hash consistent with their values, + # not codes. This should work for categoricals of any dtype. + s2 = s1.astype("category").cat.set_categories(s1) + s3 = s2.cat.set_categories(list(reversed(s1))) + + # These should all hash identically. + h1 = hash_pandas_object(s1, categorize=categorize) + h2 = hash_pandas_object(s2, categorize=categorize) + h3 = hash_pandas_object(s3, categorize=categorize) + + tm.assert_series_equal(h1, h2) + tm.assert_series_equal(h1, h3) + + +def test_categorical_with_nan_consistency(): + c = pd.Categorical.from_codes( + [-1, 0, 1, 2, 3, 4], + categories=pd.date_range("2012-01-01", periods=5, name="B")) + expected = hash_array(c, categorize=False) + + c = pd.Categorical.from_codes( + [-1, 0], + categories=[pd.Timestamp("2012-01-01")]) + result = hash_array(c, categorize=False) + + assert result[0] in expected + assert result[1] in expected + + +@pytest.mark.filterwarnings("ignore:\\nPanel:FutureWarning") +@pytest.mark.parametrize("obj", [pd.Timestamp("20130101"), tm.makePanel()]) +def test_pandas_errors(obj): + msg = "Unexpected type for hashing" + with pytest.raises(TypeError, match=msg): + hash_pandas_object(obj) + + +def test_hash_keys(): + # Using different hash keys, should have + # different hashes for the same data. + # + # This only matters for object dtypes. + obj = Series(list("abc")) + + a = hash_pandas_object(obj, hash_key="9876543210123456") + b = hash_pandas_object(obj, hash_key="9876543210123465") + + assert (a != b).all() + + +def test_invalid_key(): + # This only matters for object dtypes. + msg = "key should be a 16-byte string encoded" + + with pytest.raises(ValueError, match=msg): + hash_pandas_object(Series(list("abc")), hash_key="foo") + + +def test_already_encoded(): + # If already encoded, then ok. + obj = Series(list("abc")).str.encode("utf8") + _check_equal(obj) + + +def test_alternate_encoding(): + obj = Series(list("abc")) + _check_equal(obj, encoding="ascii") + + +@pytest.mark.parametrize("l_exp", range(8)) +@pytest.mark.parametrize("l_add", [0, 1]) +def test_same_len_hash_collisions(l_exp, l_add): + length = 2**(l_exp + 8) + l_add + s = tm.rands_array(length, 2) + + result = hash_array(s, "utf8") + assert not result[0] == result[1] + + +def test_hash_collisions(): + # Hash collisions are bad. + # + # https://github.com/pandas-dev/pandas/issues/14711#issuecomment-264885726 + hashes = ["Ingrid-9Z9fKIZmkO7i7Cn51Li34pJm44fgX6DYGBNj3VPlOH50m7HnBlPxfIwFMrcNJNMP6PSgLmwWnInciMWrCSAlLEvt7JkJl4IxiMrVbXSa8ZQoVaq5xoQPjltuJEfwdNlO6jo8qRRHvD8sBEBMQASrRa6TsdaPTPCBo3nwIBpE7YzzmyH0vMBhjQZLx1aCT7faSEx7PgFxQhHdKFWROcysamgy9iVj8DO2Fmwg1NNl93rIAqC3mdqfrCxrzfvIY8aJdzin2cHVzy3QUJxZgHvtUtOLxoqnUHsYbNTeq0xcLXpTZEZCxD4PGubIuCNf32c33M7HFsnjWSEjE2yVdWKhmSVodyF8hFYVmhYnMCztQnJrt3O8ZvVRXd5IKwlLexiSp4h888w7SzAIcKgc3g5XQJf6MlSMftDXm9lIsE1mJNiJEv6uY6pgvC3fUPhatlR5JPpVAHNSbSEE73MBzJrhCAbOLXQumyOXigZuPoME7QgJcBalliQol7YZ9", # noqa + "Tim-b9MddTxOWW2AT1Py6vtVbZwGAmYCjbp89p8mxsiFoVX4FyDOF3wFiAkyQTUgwg9sVqVYOZo09Dh1AzhFHbgij52ylF0SEwgzjzHH8TGY8Lypart4p4onnDoDvVMBa0kdthVGKl6K0BDVGzyOXPXKpmnMF1H6rJzqHJ0HywfwS4XYpVwlAkoeNsiicHkJUFdUAhG229INzvIAiJuAHeJDUoyO4DCBqtoZ5TDend6TK7Y914yHlfH3g1WZu5LksKv68VQHJriWFYusW5e6ZZ6dKaMjTwEGuRgdT66iU5nqWTHRH8WSzpXoCFwGcTOwyuqPSe0fTe21DVtJn1FKj9F9nEnR9xOvJUO7E0piCIF4Ad9yAIDY4DBimpsTfKXCu1vdHpKYerzbndfuFe5AhfMduLYZJi5iAw8qKSwR5h86ttXV0Mc0QmXz8dsRvDgxjXSmupPxBggdlqUlC828hXiTPD7am0yETBV0F3bEtvPiNJfremszcV8NcqAoARMe"] # noqa + + # These should be different. + result1 = hash_array(np.asarray(hashes[0:1], dtype=object), "utf8") + expected1 = np.array([14963968704024874985], dtype=np.uint64) + tm.assert_numpy_array_equal(result1, expected1) + + result2 = hash_array(np.asarray(hashes[1:2], dtype=object), "utf8") + expected2 = np.array([16428432627716348016], dtype=np.uint64) + tm.assert_numpy_array_equal(result2, expected2) + + result = hash_array(np.asarray(hashes, dtype=object), "utf8") + tm.assert_numpy_array_equal(result, np.concatenate([expected1, + expected2], axis=0)) From dfa24a81337f0af31d45c411bcb7950ecc0d441e Mon Sep 17 00:00:00 2001 From: gfyoung Date: Fri, 30 Nov 2018 10:53:38 -0800 Subject: [PATCH 2/2] BUG: Actually check index=True/False in some cases _check_equal appeared to be checking index=True/False, but we actually need to do explicitly pass in index=True/False for this to work. --- pandas/tests/util/test_hashing.py | 31 +++++++++++++++---------------- 1 file changed, 15 insertions(+), 16 deletions(-) diff --git a/pandas/tests/util/test_hashing.py b/pandas/tests/util/test_hashing.py index 53aeebc53e338..84bc1863aadd9 100644 --- a/pandas/tests/util/test_hashing.py +++ b/pandas/tests/util/test_hashing.py @@ -23,6 +23,11 @@ def series(request): return request.param +@pytest.fixture(params=[True, False]) +def index(request): + return request.param + + def _check_equal(obj, **kwargs): """ Check that hashing an objects produces the same value each time. @@ -38,12 +43,6 @@ def _check_equal(obj, **kwargs): b = hash_pandas_object(obj, **kwargs) tm.assert_series_equal(a, b) - kwargs.pop("index", None) - - a = hash_pandas_object(obj, **kwargs) - b = hash_pandas_object(obj, **kwargs) - tm.assert_series_equal(a, b) - def _check_not_equal_with_index(obj): """ @@ -203,22 +202,22 @@ def test_multiindex_objects(): pd.date_range("20130101", periods=2)]), MultiIndex.from_product([pd.CategoricalIndex(list("aabc")), range(3)]) ]) -def test_hash_pandas_object(obj): - _check_equal(obj) +def test_hash_pandas_object(obj, index): + _check_equal(obj, index=index) _check_not_equal_with_index(obj) -def test_hash_pandas_object2(series): - _check_equal(series) +def test_hash_pandas_object2(series, index): + _check_equal(series, index=index) _check_not_equal_with_index(series) @pytest.mark.parametrize("obj", [ Series([], dtype="float64"), Series([], dtype="object"), Index([])]) -def test_hash_pandas_empty_object(obj): +def test_hash_pandas_empty_object(obj, index): # These are by-definition the same with # or without the index as the data is empty. - _check_equal(obj) + _check_equal(obj, index=index) @pytest.mark.parametrize("s1", [ @@ -287,15 +286,15 @@ def test_invalid_key(): hash_pandas_object(Series(list("abc")), hash_key="foo") -def test_already_encoded(): +def test_already_encoded(index): # If already encoded, then ok. obj = Series(list("abc")).str.encode("utf8") - _check_equal(obj) + _check_equal(obj, index=index) -def test_alternate_encoding(): +def test_alternate_encoding(index): obj = Series(list("abc")) - _check_equal(obj, encoding="ascii") + _check_equal(obj, index=index, encoding="ascii") @pytest.mark.parametrize("l_exp", range(8))