Skip to content

Commit e8f206d

Browse files
janrito authored and jreback committed
Safely raise errors when object contains unicode (#20593)
1 parent 2431641 commit e8f206d

File tree

3 files changed

+75
-1
lines changed

3 files changed

+75
-1
lines changed

doc/source/whatsnew/v0.23.0.txt

+1
Original file line numberDiff line numberDiff line change
@@ -1170,3 +1170,4 @@ Other
11701170

11711171
- Improved error message when attempting to use a Python keyword as an identifier in a ``numexpr`` backed query (:issue:`18221`)
11721172
- Bug in accessing a :func:`pandas.get_option`, which raised ``KeyError`` rather than ``OptionError`` when looking up a non-existent option key in some cases (:issue:`19789`)
1173+
- Bug in :func:`assert_series_equal` and :func:`assert_frame_equal` for Series or DataFrames with differing unicode data (:issue:`20503`)

pandas/tests/util/test_testing.py

+64
Original file line numberDiff line numberDiff line change
@@ -290,6 +290,24 @@ def test_numpy_array_equal_message(self):
290290
assert_almost_equal(np.array([1, 2]), np.array([3, 4, 5]),
291291
obj='Index')
292292

293+
def test_numpy_array_equal_unicode_message(self):
294+
# Test ensures that `assert_numpy_array_equal` raises the right
295+
# exception when comparing np.arrays containing differing
296+
# unicode objects (#20503)
297+
298+
expected = """numpy array are different
299+
300+
numpy array values are different \\(33\\.33333 %\\)
301+
\\[left\\]: \\[á, à, ä\\]
302+
\\[right\\]: \\[á, à, å\\]"""
303+
304+
with tm.assert_raises_regex(AssertionError, expected):
305+
assert_numpy_array_equal(np.array([u'á', u'à', u'ä']),
306+
np.array([u'á', u'à', u'å']))
307+
with tm.assert_raises_regex(AssertionError, expected):
308+
assert_almost_equal(np.array([u'á', u'à', u'ä']),
309+
np.array([u'á', u'à', u'å']))
310+
293311
@td.skip_if_windows
294312
def test_numpy_array_equal_object_message(self):
295313

@@ -499,10 +517,13 @@ def _assert_not_equal(self, a, b, **kwargs):
499517
def test_equal(self):
500518
self._assert_equal(Series(range(3)), Series(range(3)))
501519
self._assert_equal(Series(list('abc')), Series(list('abc')))
520+
self._assert_equal(Series(list(u'áàä')), Series(list(u'áàä')))
502521

503522
def test_not_equal(self):
504523
self._assert_not_equal(Series(range(3)), Series(range(3)) + 1)
505524
self._assert_not_equal(Series(list('abc')), Series(list('xyz')))
525+
self._assert_not_equal(Series(list(u'áàä')), Series(list(u'éèë')))
526+
self._assert_not_equal(Series(list(u'áàä')), Series(list(b'aaa')))
506527
self._assert_not_equal(Series(range(3)), Series(range(4)))
507528
self._assert_not_equal(
508529
Series(range(3)), Series(
@@ -678,6 +699,49 @@ def test_frame_equal_message(self):
678699
pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 7]}),
679700
by_blocks=True)
680701

702+
def test_frame_equal_message_unicode(self):
703+
# Test ensures that `assert_frame_equal` raises the right
704+
# exception when comparing DataFrames containing differing
705+
# unicode objects (#20503)
706+
707+
expected = """DataFrame\\.iloc\\[:, 1\\] are different
708+
709+
DataFrame\\.iloc\\[:, 1\\] values are different \\(33\\.33333 %\\)
710+
\\[left\\]: \\[é, è, ë\\]
711+
\\[right\\]: \\[é, è, e̊\\]"""
712+
713+
with tm.assert_raises_regex(AssertionError, expected):
714+
assert_frame_equal(pd.DataFrame({'A': [u'á', u'à', u'ä'],
715+
'E': [u'é', u'è', u'ë']}),
716+
pd.DataFrame({'A': [u'á', u'à', u'ä'],
717+
'E': [u'é', u'è', u'e̊']}))
718+
719+
with tm.assert_raises_regex(AssertionError, expected):
720+
assert_frame_equal(pd.DataFrame({'A': [u'á', u'à', u'ä'],
721+
'E': [u'é', u'è', u'ë']}),
722+
pd.DataFrame({'A': [u'á', u'à', u'ä'],
723+
'E': [u'é', u'è', u'e̊']}),
724+
by_blocks=True)
725+
726+
expected = """DataFrame\\.iloc\\[:, 0\\] are different
727+
728+
DataFrame\\.iloc\\[:, 0\\] values are different \\(100\\.0 %\\)
729+
\\[left\\]: \\[á, à, ä\\]
730+
\\[right\\]: \\[a, a, a\\]"""
731+
732+
with tm.assert_raises_regex(AssertionError, expected):
733+
assert_frame_equal(pd.DataFrame({'A': [u'á', u'à', u'ä'],
734+
'E': [u'é', u'è', u'ë']}),
735+
pd.DataFrame({'A': ['a', 'a', 'a'],
736+
'E': ['e', 'e', 'e']}))
737+
738+
with tm.assert_raises_regex(AssertionError, expected):
739+
assert_frame_equal(pd.DataFrame({'A': [u'á', u'à', u'ä'],
740+
'E': [u'é', u'è', u'ë']}),
741+
pd.DataFrame({'A': ['a', 'a', 'a'],
742+
'E': ['e', 'e', 'e']}),
743+
by_blocks=True)
744+
681745

682746
class TestAssertCategoricalEqual(object):
683747

pandas/util/testing.py

+10-1
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,7 @@
3838
import pandas.compat as compat
3939
from pandas.compat import (
4040
filter, map, zip, range, unichr, lrange, lmap, lzip, u, callable, Counter,
41-
raise_with_traceback, httplib, StringIO, PY3)
41+
raise_with_traceback, httplib, StringIO, string_types, PY3, PY2)
4242

4343
from pandas import (bdate_range, CategoricalIndex, Categorical, IntervalIndex,
4444
DatetimeIndex, TimedeltaIndex, PeriodIndex, RangeIndex,
@@ -992,11 +992,20 @@ def raise_assert_detail(obj, message, left, right, diff=None):
992992
left = pprint_thing(left)
993993
elif is_categorical_dtype(left):
994994
left = repr(left)
995+
996+
if PY2 and isinstance(left, string_types):
997+
# left needs to be printable in native text type in python2
998+
left = left.encode('utf-8')
999+
9951000
if isinstance(right, np.ndarray):
9961001
right = pprint_thing(right)
9971002
elif is_categorical_dtype(right):
9981003
right = repr(right)
9991004

1005+
if PY2 and isinstance(right, string_types):
1006+
# right needs to be printable in native text type in python2
1007+
right = right.encode('utf-8')
1008+
10001009
msg = """{obj} are different
10011010
10021011
{message}

0 commit comments

Comments
 (0)