Skip to content

Safely raise errors when object contains unicode #20593

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.23.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -1165,3 +1165,4 @@ Other

- Improved error message when attempting to use a Python keyword as an identifier in a ``numexpr`` backed query (:issue:`18221`)
- Bug in :func:`pandas.get_option`, which raised ``KeyError`` rather than ``OptionError`` when looking up a non-existent option key in some cases (:issue:`19789`)
- Bug in :func:`assert_series_equal` and :func:`assert_frame_equal` where the wrong exception was raised when comparing Series or DataFrames with differing unicode data (:issue:`20503`)
64 changes: 64 additions & 0 deletions pandas/tests/util/test_testing.py
Original file line number Diff line number Diff line change
Expand Up @@ -290,6 +290,24 @@ def test_numpy_array_equal_message(self):
assert_almost_equal(np.array([1, 2]), np.array([3, 4, 5]),
obj='Index')

def test_numpy_array_equal_unicode_message(self):
# Test ensures that `assert_numpy_array_equal` raises the right
# exception when comparing np.arrays containing differing
# unicode objects (#20503)

expected = """numpy array are different

numpy array values are different \\(33\\.33333 %\\)
\\[left\\]: \\[á, à, ä\\]
\\[right\\]: \\[á, à, å\\]"""

with tm.assert_raises_regex(AssertionError, expected):
assert_numpy_array_equal(np.array([u'á', u'à', u'ä']),
np.array([u'á', u'à', u'å']))
with tm.assert_raises_regex(AssertionError, expected):
assert_almost_equal(np.array([u'á', u'à', u'ä']),
np.array([u'á', u'à', u'å']))

@td.skip_if_windows
def test_numpy_array_equal_object_message(self):

Expand Down Expand Up @@ -499,10 +517,13 @@ def _assert_not_equal(self, a, b, **kwargs):
def test_equal(self):
self._assert_equal(Series(range(3)), Series(range(3)))
self._assert_equal(Series(list('abc')), Series(list('abc')))
self._assert_equal(Series(list(u'áàä')), Series(list(u'áàä')))
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you add a test where left is unicode and right is a non-unicode string?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

👍 1f7e231


def test_not_equal(self):
self._assert_not_equal(Series(range(3)), Series(range(3)) + 1)
self._assert_not_equal(Series(list('abc')), Series(list('xyz')))
self._assert_not_equal(Series(list(u'áàä')), Series(list(u'éèë')))
self._assert_not_equal(Series(list(u'áàä')), Series(list(b'aaa')))
self._assert_not_equal(Series(range(3)), Series(range(4)))
self._assert_not_equal(
Series(range(3)), Series(
Expand Down Expand Up @@ -678,6 +699,49 @@ def test_frame_equal_message(self):
pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 7]}),
by_blocks=True)

def test_frame_equal_message_unicode(self):
# Test ensures that `assert_frame_equal` raises the right
# exception when comparing DataFrames containing differing
# unicode objects (#20503)

expected = """DataFrame\\.iloc\\[:, 1\\] are different

DataFrame\\.iloc\\[:, 1\\] values are different \\(33\\.33333 %\\)
\\[left\\]: \\[é, è, ë\\]
\\[right\\]: \\[é, è, e̊\\]"""

with tm.assert_raises_regex(AssertionError, expected):
assert_frame_equal(pd.DataFrame({'A': [u'á', u'à', u'ä'],
'E': [u'é', u'è', u'ë']}),
pd.DataFrame({'A': [u'á', u'à', u'ä'],
'E': [u'é', u'è', u'e̊']}))

with tm.assert_raises_regex(AssertionError, expected):
assert_frame_equal(pd.DataFrame({'A': [u'á', u'à', u'ä'],
'E': [u'é', u'è', u'ë']}),
pd.DataFrame({'A': [u'á', u'à', u'ä'],
'E': [u'é', u'è', u'e̊']}),
by_blocks=True)

expected = """DataFrame\\.iloc\\[:, 0\\] are different

DataFrame\\.iloc\\[:, 0\\] values are different \\(100\\.0 %\\)
\\[left\\]: \\[á, à, ä\\]
\\[right\\]: \\[a, a, a\\]"""

with tm.assert_raises_regex(AssertionError, expected):
assert_frame_equal(pd.DataFrame({'A': [u'á', u'à', u'ä'],
'E': [u'é', u'è', u'ë']}),
pd.DataFrame({'A': ['a', 'a', 'a'],
'E': ['e', 'e', 'e']}))

with tm.assert_raises_regex(AssertionError, expected):
assert_frame_equal(pd.DataFrame({'A': [u'á', u'à', u'ä'],
'E': [u'é', u'è', u'ë']}),
pd.DataFrame({'A': ['a', 'a', 'a'],
'E': ['e', 'e', 'e']}),
by_blocks=True)


class TestAssertCategoricalEqual(object):

Expand Down
11 changes: 10 additions & 1 deletion pandas/util/testing.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@
import pandas.compat as compat
from pandas.compat import (
filter, map, zip, range, unichr, lrange, lmap, lzip, u, callable, Counter,
raise_with_traceback, httplib, StringIO, PY3)
raise_with_traceback, httplib, StringIO, string_types, PY3, PY2)

from pandas import (bdate_range, CategoricalIndex, Categorical, IntervalIndex,
DatetimeIndex, TimedeltaIndex, PeriodIndex, RangeIndex,
Expand Down Expand Up @@ -992,11 +992,20 @@ def raise_assert_detail(obj, message, left, right, diff=None):
left = pprint_thing(left)
elif is_categorical_dtype(left):
left = repr(left)

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you import PY2 and string_types at the top of the file?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

👍 575b2e8

if PY2 and isinstance(left, string_types):
# left needs to be printable in native text type in python2
left = left.encode('utf-8')

if isinstance(right, np.ndarray):
right = pprint_thing(right)
elif is_categorical_dtype(right):
right = repr(right)

if PY2 and isinstance(right, string_types):
# right needs to be printable in native text type in python2
right = right.encode('utf-8')

msg = """{obj} are different

{message}
Expand Down