diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index 09bd09b06d9b9..ca46f94752731 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -1165,3 +1165,4 @@ Other - Improved error message when attempting to use a Python keyword as an identifier in a ``numexpr`` backed query (:issue:`18221`) - Bug in accessing a :func:`pandas.get_option`, which raised ``KeyError`` rather than ``OptionError`` when looking up a non-existant option key in some cases (:issue:`19789`) +- Bug in :func:`assert_series_equal` and :func:`assert_frame_equal` for Series or DataFrames with differing unicode data (:issue:`20503`) diff --git a/pandas/tests/util/test_testing.py b/pandas/tests/util/test_testing.py index 1c878604b11a2..d6f58d16bcf64 100644 --- a/pandas/tests/util/test_testing.py +++ b/pandas/tests/util/test_testing.py @@ -290,6 +290,24 @@ def test_numpy_array_equal_message(self): assert_almost_equal(np.array([1, 2]), np.array([3, 4, 5]), obj='Index') + def test_numpy_array_equal_unicode_message(self): + # Test ensures that `assert_numpy_array_equal` raises the right + # exception when comparing np.arrays containing differing + # unicode objects (#20503) + + expected = """numpy array are different + +numpy array values are different \\(33\\.33333 %\\) +\\[left\\]: \\[á, à, ä\\] +\\[right\\]: \\[á, à, å\\]""" + + with tm.assert_raises_regex(AssertionError, expected): + assert_numpy_array_equal(np.array([u'á', u'à', u'ä']), + np.array([u'á', u'à', u'å'])) + with tm.assert_raises_regex(AssertionError, expected): + assert_almost_equal(np.array([u'á', u'à', u'ä']), + np.array([u'á', u'à', u'å'])) + @td.skip_if_windows def test_numpy_array_equal_object_message(self): @@ -499,10 +517,13 @@ def _assert_not_equal(self, a, b, **kwargs): def test_equal(self): self._assert_equal(Series(range(3)), Series(range(3))) self._assert_equal(Series(list('abc')), Series(list('abc'))) + self._assert_equal(Series(list(u'áàä')), 
Series(list(u'áàä'))) def test_not_equal(self): self._assert_not_equal(Series(range(3)), Series(range(3)) + 1) self._assert_not_equal(Series(list('abc')), Series(list('xyz'))) + self._assert_not_equal(Series(list(u'áàä')), Series(list(u'éèë'))) + self._assert_not_equal(Series(list(u'áàä')), Series(list(b'aaa'))) self._assert_not_equal(Series(range(3)), Series(range(4))) self._assert_not_equal( Series(range(3)), Series( @@ -678,6 +699,49 @@ def test_frame_equal_message(self): pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 7]}), by_blocks=True) + def test_frame_equal_message_unicode(self): + # Test ensures that `assert_frame_equal` raises the right + # exception when comparing DataFrames containing differing + # unicode objects (#20503) + + expected = """DataFrame\\.iloc\\[:, 1\\] are different + +DataFrame\\.iloc\\[:, 1\\] values are different \\(33\\.33333 %\\) +\\[left\\]: \\[é, è, ë\\] +\\[right\\]: \\[é, è, e̊\\]""" + + with tm.assert_raises_regex(AssertionError, expected): + assert_frame_equal(pd.DataFrame({'A': [u'á', u'à', u'ä'], + 'E': [u'é', u'è', u'ë']}), + pd.DataFrame({'A': [u'á', u'à', u'ä'], + 'E': [u'é', u'è', u'e̊']})) + + with tm.assert_raises_regex(AssertionError, expected): + assert_frame_equal(pd.DataFrame({'A': [u'á', u'à', u'ä'], + 'E': [u'é', u'è', u'ë']}), + pd.DataFrame({'A': [u'á', u'à', u'ä'], + 'E': [u'é', u'è', u'e̊']}), + by_blocks=True) + + expected = """DataFrame\\.iloc\\[:, 0\\] are different + +DataFrame\\.iloc\\[:, 0\\] values are different \\(100\\.0 %\\) +\\[left\\]: \\[á, à, ä\\] +\\[right\\]: \\[a, a, a\\]""" + + with tm.assert_raises_regex(AssertionError, expected): + assert_frame_equal(pd.DataFrame({'A': [u'á', u'à', u'ä'], + 'E': [u'é', u'è', u'ë']}), + pd.DataFrame({'A': ['a', 'a', 'a'], + 'E': ['e', 'e', 'e']})) + + with tm.assert_raises_regex(AssertionError, expected): + assert_frame_equal(pd.DataFrame({'A': [u'á', u'à', u'ä'], + 'E': [u'é', u'è', u'ë']}), + pd.DataFrame({'A': ['a', 'a', 'a'], + 'E': ['e', 'e', 'e']}), + 
by_blocks=True) + class TestAssertCategoricalEqual(object): diff --git a/pandas/util/testing.py b/pandas/util/testing.py index 6e13a17eba68c..e1484a9c1b390 100644 --- a/pandas/util/testing.py +++ b/pandas/util/testing.py @@ -38,7 +38,7 @@ import pandas.compat as compat from pandas.compat import ( filter, map, zip, range, unichr, lrange, lmap, lzip, u, callable, Counter, - raise_with_traceback, httplib, StringIO, PY3) + raise_with_traceback, httplib, StringIO, string_types, PY3, PY2) from pandas import (bdate_range, CategoricalIndex, Categorical, IntervalIndex, DatetimeIndex, TimedeltaIndex, PeriodIndex, RangeIndex, @@ -992,11 +992,20 @@ def raise_assert_detail(obj, message, left, right, diff=None): left = pprint_thing(left) elif is_categorical_dtype(left): left = repr(left) + + if PY2 and isinstance(left, string_types): + # left needs to be printable in native text type in python2 + left = left.encode('utf-8') + if isinstance(right, np.ndarray): right = pprint_thing(right) elif is_categorical_dtype(right): right = repr(right) + if PY2 and isinstance(right, string_types): + # right needs to be printable in native text type in python2 + right = right.encode('utf-8') + msg = """{obj} are different {message}