Skip to content

Safely raise errors when object contains unicode #20593

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.23.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -1165,3 +1165,4 @@ Other

- Improved error message when attempting to use a Python keyword as an identifier in a ``numexpr`` backed query (:issue:`18221`)
- Bug in accessing a :func:`pandas.get_option`, which raised ``KeyError`` rather than ``OptionError`` when looking up a non-existent option key in some cases (:issue:`19789`)
- Bug in :func:`assert_series_equal` and :func:`assert_frame_equal` for Series or DataFrames with differing unicode data (:issue:`20503`)
44 changes: 44 additions & 0 deletions pandas/tests/util/test_testing.py
Original file line number Diff line number Diff line change
Expand Up @@ -290,6 +290,24 @@ def test_numpy_array_equal_message(self):
assert_almost_equal(np.array([1, 2]), np.array([3, 4, 5]),
obj='Index')

def test_numpy_array_equal_unicode_message(self):
    """Check the error raised for numpy arrays with differing unicode.

    Both ``assert_numpy_array_equal`` and ``assert_almost_equal`` are
    given two arrays of non-ASCII strings that differ only in their last
    element; each call must raise an ``AssertionError`` whose message
    matches the pattern below (#20503).
    """
    pattern = """numpy array are different

numpy array values are different \\(33\\.33333 %\\)
\\[left\\]: \\[á, à, ä\\]
\\[right\\]: \\[á, à, å\\]"""

    # The same pair of inputs is run through each comparison helper; the
    # arrays are rebuilt per helper so the two checks stay independent.
    for checker in (assert_numpy_array_equal, assert_almost_equal):
        with tm.assert_raises_regex(AssertionError, pattern):
            checker(np.array([u'á', u'à', u'ä']),
                    np.array([u'á', u'à', u'å']))

@td.skip_if_windows
def test_numpy_array_equal_object_message(self):

Expand Down Expand Up @@ -499,10 +517,12 @@ def _assert_not_equal(self, a, b, **kwargs):
def test_equal(self):
    """Pass pairs of identically-built Series to ``self._assert_equal``.

    Covers integer data, ASCII strings and non-ASCII (unicode) strings.
    """
    # Each factory is called twice so left and right are built from
    # separate input objects, as in a hand-written pair of constructors.
    factories = (
        lambda: range(3),
        lambda: list('abc'),
        lambda: list(u'áàä'),
    )
    for make_values in factories:
        self._assert_equal(Series(make_values()), Series(make_values()))
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can you add a test where left is unicode and right is non-unicode (but string)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

👍 1f7e231


def test_not_equal(self):
self._assert_not_equal(Series(range(3)), Series(range(3)) + 1)
self._assert_not_equal(Series(list('abc')), Series(list('xyz')))
self._assert_not_equal(Series(list(u'áàä')), Series(list(u'éèë')))
self._assert_not_equal(Series(range(3)), Series(range(4)))
self._assert_not_equal(
Series(range(3)), Series(
Expand Down Expand Up @@ -678,6 +698,30 @@ def test_frame_equal_message(self):
pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 7]}),
by_blocks=True)

def test_frame_equal_message_unicode(self):
# Test ensures that `assert_frame_equal` raises the right
# exception when comparing DataFrames containing differing
# unicode objects (#20503)

expected = """DataFrame\\.iloc\\[:, 1\\] are different
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

same

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

👍 6b087f2


DataFrame\\.iloc\\[:, 1\\] values are different \\(33\\.33333 %\\)
\\[left\\]: \\[é, è, ë\\]
\\[right\\]: \\[é, è, e̊\\]"""

with tm.assert_raises_regex(AssertionError, expected):
assert_frame_equal(pd.DataFrame({'A': [u'á', u'à', u'ä'],
'E': [u'é', u'è', u'ë']}),
pd.DataFrame({'A': [u'á', u'à', u'ä'],
'E': [u'é', u'è', u'e̊']}))

with tm.assert_raises_regex(AssertionError, expected):
assert_frame_equal(pd.DataFrame({'A': [u'á', u'à', u'ä'],
'E': [u'é', u'è', u'ë']}),
pd.DataFrame({'A': [u'á', u'à', u'ä'],
'E': [u'é', u'è', u'e̊']}),
by_blocks=True)


class TestAssertCategoricalEqual(object):

Expand Down
7 changes: 7 additions & 0 deletions pandas/util/testing.py
Original file line number Diff line number Diff line change
Expand Up @@ -992,11 +992,18 @@ def raise_assert_detail(obj, message, left, right, diff=None):
left = pprint_thing(left)
elif is_categorical_dtype(left):
left = repr(left)

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can you import PY2 and string_types up top

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

👍575b2e8

if compat.PY2 and isinstance(left, compat.string_types):
left = left.encode('utf-8')

if isinstance(right, np.ndarray):
right = pprint_thing(right)
elif is_categorical_dtype(right):
right = repr(right)

if compat.PY2 and isinstance(right, compat.string_types):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can you add a comment on these.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

👍 45d2b8e

right = right.encode('utf-8')

msg = """{obj} are different

{message}
Expand Down