From 9cd83ff2dd2c1defd16b321e41411ce112974108 Mon Sep 17 00:00:00 2001 From: Alejandro Giacometti Date: Tue, 3 Apr 2018 14:13:08 +0100 Subject: [PATCH 01/11] Safely raise errors when object contains unicode This safely turns np.ndarray objects that contain unicode into a representation that can be printed --- pandas/util/testing.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/util/testing.py b/pandas/util/testing.py index 6e13a17eba68c..cb3cc0e1c6a76 100644 --- a/pandas/util/testing.py +++ b/pandas/util/testing.py @@ -989,11 +989,11 @@ def assert_categorical_equal(left, right, check_dtype=True, def raise_assert_detail(obj, message, left, right, diff=None): if isinstance(left, np.ndarray): - left = pprint_thing(left) + left = repr(pprint_thing(left)) elif is_categorical_dtype(left): left = repr(left) if isinstance(right, np.ndarray): - right = pprint_thing(right) + right = repr(pprint_thing(right)) elif is_categorical_dtype(right): right = repr(right) From 46cadc0b6d4ec7b1d969701f59bf0e0c9a885634 Mon Sep 17 00:00:00 2001 From: Alejandro Giacometti Date: Wed, 4 Apr 2018 11:35:22 +0100 Subject: [PATCH 02/11] Whatsnew entry --- doc/source/whatsnew/v0.23.0.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index 09bd09b06d9b9..6571c1d3690ad 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -1165,3 +1165,4 @@ Other - Improved error message when attempting to use a Python keyword as an identifier in a ``numexpr`` backed query (:issue:`18221`) - Bug in accessing a :func:`pandas.get_option`, which raised ``KeyError`` rather than ``OptionError`` when looking up a non-existant option key in some cases (:issue:`19789`) +- Bug in :func:`raise_assert_detail` for Series and DataFrames with differing unicode data (:issue:`20503`) From 99ac0e8634d603d22c9d1635914ce70a48eb5939 Mon Sep 17 00:00:00 2001 From: Alejandro Giacometti Date: Thu, 5 Apr 2018
15:58:24 +0100 Subject: [PATCH 03/11] Tests for comparisons of objects containing unicode --- pandas/tests/util/test_testing.py | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/pandas/tests/util/test_testing.py b/pandas/tests/util/test_testing.py index 1c878604b11a2..7c5e95bcd7f76 100644 --- a/pandas/tests/util/test_testing.py +++ b/pandas/tests/util/test_testing.py @@ -276,6 +276,19 @@ def test_numpy_array_equal_message(self): assert_almost_equal(np.array([[1, 2], [3, 4]]), np.array([[1, 3], [3, 4]])) + expected = """numpy array are different + +numpy array values are different \\(33\\.33333 %\\) +\\[left\\]: \\[á, à, ä\\] +\\[right\\]: \\[á, à, å\\]""" + + with tm.assert_raises_regex(AssertionError, expected): + assert_numpy_array_equal(np.array([u"á", u"à", u"ä"]), + np.array([u"á", u"à", u"å"])) + with tm.assert_raises_regex(AssertionError, expected): + assert_almost_equal(np.array([u"á", u"à", u"ä"]), + np.array([u"á", u"à", u"å"])) + # allow to overwrite message expected = """Index are different @@ -499,10 +512,12 @@ def _assert_not_equal(self, a, b, **kwargs): def test_equal(self): self._assert_equal(Series(range(3)), Series(range(3))) self._assert_equal(Series(list('abc')), Series(list('abc'))) + self._assert_equal(Series(list(u'áàä')), Series(list(u'áàä'))) def test_not_equal(self): self._assert_not_equal(Series(range(3)), Series(range(3)) + 1) self._assert_not_equal(Series(list('abc')), Series(list('xyz'))) + self._assert_not_equal(Series(list(u'áàä')), Series(list(u'éèë'))) self._assert_not_equal(Series(range(3)), Series(range(4))) self._assert_not_equal( Series(range(3)), Series( @@ -678,6 +693,21 @@ def test_frame_equal_message(self): pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 7]}), by_blocks=True) + expected = """DataFrame\\.iloc\\[:, 1\\] are different + +DataFrame\\.iloc\\[:, 1\\] values are different \\(33\\.33333 %\\) +\\[left\\]: \\[é, è, ë\\] +\\[right\\]: \\[é, è, e̊\\]""" + + with 
tm.assert_raises_regex(AssertionError, expected): + assert_frame_equal(pd.DataFrame({'A': [u"á", u"à", u"ä"], 'E': [u"é", u"è", u"ë"]}), + pd.DataFrame({'A': [u"á", u"à", u"ä"], 'E': [u"é", u"è", u"e̊"]})) + + with tm.assert_raises_regex(AssertionError, expected): + assert_frame_equal(pd.DataFrame({'A': [u"á", u"à", u"ä"], 'E': [u"é", u"è", u"ë"]}), + pd.DataFrame({'A': [u"á", u"à", u"ä"], 'E': [u"é", u"è", u"e̊"]}), + by_blocks=True) + class TestAssertCategoricalEqual(object): From 329002c5a38fe217e1fb47337c82a988a11a27d6 Mon Sep 17 00:00:00 2001 From: Alejandro Giacometti Date: Thu, 5 Apr 2018 15:59:27 +0100 Subject: [PATCH 04/11] Only need to pprint with the display encoding --- pandas/util/testing.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/util/testing.py b/pandas/util/testing.py index cb3cc0e1c6a76..9582947945c0f 100644 --- a/pandas/util/testing.py +++ b/pandas/util/testing.py @@ -31,7 +31,7 @@ is_interval_dtype, is_sequence, is_list_like) -from pandas.io.formats.printing import pprint_thing +from pandas.io.formats.printing import pprint_thing_encoded from pandas.core.algorithms import take_1d import pandas.core.common as com @@ -989,11 +989,11 @@ def assert_categorical_equal(left, right, check_dtype=True, def raise_assert_detail(obj, message, left, right, diff=None): if isinstance(left, np.ndarray): - left = repr(pprint_thing(left)) + left = pprint_thing_encoded(left, encoding=pd.options.display.encoding) elif is_categorical_dtype(left): left = repr(left) if isinstance(right, np.ndarray): - right = repr(pprint_thing(right)) + right = pprint_thing_encoded(right, encoding=pd.options.display.encoding) elif is_categorical_dtype(right): right = repr(right) From b506cf6ae2f8abca1119bc8a88eaa0e303063a63 Mon Sep 17 00:00:00 2001 From: Alejandro Giacometti Date: Thu, 5 Apr 2018 16:09:20 +0100 Subject: [PATCH 05/11] Linting --- pandas/tests/util/test_testing.py | 20 ++++++++++++-------- pandas/util/testing.py | 6 ++++-- 2 files 
changed, 16 insertions(+), 10 deletions(-) diff --git a/pandas/tests/util/test_testing.py b/pandas/tests/util/test_testing.py index 7c5e95bcd7f76..7d14b99d611f9 100644 --- a/pandas/tests/util/test_testing.py +++ b/pandas/tests/util/test_testing.py @@ -283,11 +283,11 @@ def test_numpy_array_equal_message(self): \\[right\\]: \\[á, à, å\\]""" with tm.assert_raises_regex(AssertionError, expected): - assert_numpy_array_equal(np.array([u"á", u"à", u"ä"]), - np.array([u"á", u"à", u"å"])) + assert_numpy_array_equal(np.array([u'á', u'à', u'ä']), + np.array([u'á', u'à', u'å'])) with tm.assert_raises_regex(AssertionError, expected): - assert_almost_equal(np.array([u"á", u"à", u"ä"]), - np.array([u"á", u"à", u"å"])) + assert_almost_equal(np.array([u'á', u'à', u'ä']), + np.array([u'á', u'à', u'å'])) # allow to overwrite message expected = """Index are different @@ -700,12 +700,16 @@ def test_frame_equal_message(self): \\[right\\]: \\[é, è, e̊\\]""" with tm.assert_raises_regex(AssertionError, expected): - assert_frame_equal(pd.DataFrame({'A': [u"á", u"à", u"ä"], 'E': [u"é", u"è", u"ë"]}), - pd.DataFrame({'A': [u"á", u"à", u"ä"], 'E': [u"é", u"è", u"e̊"]})) + assert_frame_equal(pd.DataFrame({'A': [u'á', u'à', u'ä'], + 'E': [u'é', u'è', u'ë']}), + pd.DataFrame({'A': [u'á', u'à', u'ä'], + 'E': [u'é', u'è', u'e̊']})) with tm.assert_raises_regex(AssertionError, expected): - assert_frame_equal(pd.DataFrame({'A': [u"á", u"à", u"ä"], 'E': [u"é", u"è", u"ë"]}), - pd.DataFrame({'A': [u"á", u"à", u"ä"], 'E': [u"é", u"è", u"e̊"]}), + assert_frame_equal(pd.DataFrame({'A': [u'á', u'à', u'ä'], + 'E': [u'é', u'è', u'ë']}), + pd.DataFrame({'A': [u'á', u'à', u'ä'], + 'E': [u'é', u'è', u'e̊']}), by_blocks=True) diff --git a/pandas/util/testing.py b/pandas/util/testing.py index 9582947945c0f..2f4375f98f702 100644 --- a/pandas/util/testing.py +++ b/pandas/util/testing.py @@ -989,11 +989,13 @@ def assert_categorical_equal(left, right, check_dtype=True, def raise_assert_detail(obj, message, left, 
right, diff=None): if isinstance(left, np.ndarray): - left = pprint_thing_encoded(left, encoding=pd.options.display.encoding) + left = pprint_thing_encoded(left, + encoding=pd.options.display.encoding) elif is_categorical_dtype(left): left = repr(left) if isinstance(right, np.ndarray): - right = pprint_thing_encoded(right, encoding=pd.options.display.encoding) + right = pprint_thing_encoded(right, + encoding=pd.options.display.encoding) elif is_categorical_dtype(right): right = repr(right) From 8f607c3b0e78858ea3cde8148854e90892f33f4e Mon Sep 17 00:00:00 2001 From: Alejandro Giacometti Date: Thu, 5 Apr 2018 16:27:46 +0100 Subject: [PATCH 06/11] Whatsnew update --- doc/source/whatsnew/v0.23.0.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index 6571c1d3690ad..ca46f94752731 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -1165,4 +1165,4 @@ Other - Improved error message when attempting to use a Python keyword as an identifier in a ``numexpr`` backed query (:issue:`18221`) - Bug in accessing a :func:`pandas.get_option`, which raised ``KeyError`` rather than ``OptionError`` when looking up a non-existant option key in some cases (:issue:`19789`) -- Bug in :func:`raise_assert_detail` for Series and DataFrames with differing unicode data (:issue:`20503`) +- Bug in :func:`assert_series_equal` and :func:`assert_frame_equal` for Series or DataFrames with differing unicode data (:issue:`20503`) From 6b087f286a88fc086144e5871b1540b0a464686d Mon Sep 17 00:00:00 2001 From: Alejandro Giacometti Date: Thu, 5 Apr 2018 16:46:39 +0100 Subject: [PATCH 07/11] Separate tests and document gh issue --- pandas/tests/util/test_testing.py | 36 ++++++++++++++++++++----------- 1 file changed, 23 insertions(+), 13 deletions(-) diff --git a/pandas/tests/util/test_testing.py b/pandas/tests/util/test_testing.py index 7d14b99d611f9..4668e1bec43e3 100644 ---
a/pandas/tests/util/test_testing.py +++ b/pandas/tests/util/test_testing.py @@ -276,19 +276,6 @@ def test_numpy_array_equal_message(self): assert_almost_equal(np.array([[1, 2], [3, 4]]), np.array([[1, 3], [3, 4]])) - expected = """numpy array are different - -numpy array values are different \\(33\\.33333 %\\) -\\[left\\]: \\[á, à, ä\\] -\\[right\\]: \\[á, à, å\\]""" - - with tm.assert_raises_regex(AssertionError, expected): - assert_numpy_array_equal(np.array([u'á', u'à', u'ä']), - np.array([u'á', u'à', u'å'])) - with tm.assert_raises_regex(AssertionError, expected): - assert_almost_equal(np.array([u'á', u'à', u'ä']), - np.array([u'á', u'à', u'å'])) - # allow to overwrite message expected = """Index are different @@ -303,6 +290,24 @@ def test_numpy_array_equal_message(self): assert_almost_equal(np.array([1, 2]), np.array([3, 4, 5]), obj='Index') + def test_numpy_array_equal_unicode_message(self): + # Test ensures that `assert_numpy_array_equal` raises the right + # exception when comparing np.arrays containing differing + # unicode objects (#20503) + + expected = """numpy array are different + +numpy array values are different \\(33\\.33333 %\\) +\\[left\\]: \\[á, à, ä\\] +\\[right\\]: \\[á, à, å\\]""" + + with tm.assert_raises_regex(AssertionError, expected): + assert_numpy_array_equal(np.array([u'á', u'à', u'ä']), + np.array([u'á', u'à', u'å'])) + with tm.assert_raises_regex(AssertionError, expected): + assert_almost_equal(np.array([u'á', u'à', u'ä']), + np.array([u'á', u'à', u'å'])) + @td.skip_if_windows def test_numpy_array_equal_object_message(self): @@ -693,6 +698,11 @@ def test_frame_equal_message(self): pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 7]}), by_blocks=True) + def test_frame_equal_message_unicode(self): + # Test ensures that `assert_frame_equal` raises the right + # exception when comparing DataFrames containing differing + # unicode objects (#20503) + expected = """DataFrame\\.iloc\\[:, 1\\] are different DataFrame\\.iloc\\[:, 1\\] values are
different \\(33\\.33333 %\\) From 51366f272ee2e4bd03820a91a3668e6e3c13a998 Mon Sep 17 00:00:00 2001 From: Alejandro Giacometti Date: Mon, 9 Apr 2018 16:25:43 +0100 Subject: [PATCH 08/11] Encode in utf-8 only in python2 --- pandas/util/testing.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/pandas/util/testing.py b/pandas/util/testing.py index 2f4375f98f702..007c751687c35 100644 --- a/pandas/util/testing.py +++ b/pandas/util/testing.py @@ -31,7 +31,7 @@ is_interval_dtype, is_sequence, is_list_like) -from pandas.io.formats.printing import pprint_thing_encoded +from pandas.io.formats.printing import pprint_thing from pandas.core.algorithms import take_1d import pandas.core.common as com @@ -989,16 +989,21 @@ def assert_categorical_equal(left, right, check_dtype=True, def raise_assert_detail(obj, message, left, right, diff=None): if isinstance(left, np.ndarray): - left = pprint_thing_encoded(left, - encoding=pd.options.display.encoding) + left = pprint_thing(left) elif is_categorical_dtype(left): left = repr(left) + + if compat.PY2 and isinstance(left, compat.string_types): + left = left.encode('utf-8') + if isinstance(right, np.ndarray): - right = pprint_thing_encoded(right, - encoding=pd.options.display.encoding) + right = pprint_thing(right) elif is_categorical_dtype(right): right = repr(right) + if compat.PY2 and isinstance(right, compat.string_types): + right = right.encode('utf-8') + msg = """{obj} are different {message} From 575b2e8edf19559e8e9ad361fcef210d35786c73 Mon Sep 17 00:00:00 2001 From: Alejandro Giacometti Date: Mon, 9 Apr 2018 18:02:36 +0100 Subject: [PATCH 09/11] import compat.PY2 and compat.string_types directly --- pandas/util/testing.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/util/testing.py b/pandas/util/testing.py index 007c751687c35..13250d8440149 100644 --- a/pandas/util/testing.py +++ b/pandas/util/testing.py @@ -38,7 +38,7 @@ import pandas.compat as compat from 
pandas.compat import ( filter, map, zip, range, unichr, lrange, lmap, lzip, u, callable, Counter, - raise_with_traceback, httplib, StringIO, PY3) + raise_with_traceback, httplib, StringIO, string_types, PY3, PY2) from pandas import (bdate_range, CategoricalIndex, Categorical, IntervalIndex, DatetimeIndex, TimedeltaIndex, PeriodIndex, RangeIndex, @@ -993,7 +993,7 @@ def raise_assert_detail(obj, message, left, right, diff=None): elif is_categorical_dtype(left): left = repr(left) - if compat.PY2 and isinstance(left, compat.string_types): + if PY2 and isinstance(left, string_types): left = left.encode('utf-8') if isinstance(right, np.ndarray): @@ -1001,7 +1001,7 @@ def raise_assert_detail(obj, message, left, right, diff=None): elif is_categorical_dtype(right): right = repr(right) - if compat.PY2 and isinstance(right, compat.string_types): + if PY2 and isinstance(right, string_types): right = right.encode('utf-8') msg = """{obj} are different From 45d2b8e9990a3b5df851c3b373ea70ed8b801b83 Mon Sep 17 00:00:00 2001 From: Alejandro Giacometti Date: Mon, 9 Apr 2018 18:04:35 +0100 Subject: [PATCH 10/11] Added documenting comments --- pandas/util/testing.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pandas/util/testing.py b/pandas/util/testing.py index 13250d8440149..e1484a9c1b390 100644 --- a/pandas/util/testing.py +++ b/pandas/util/testing.py @@ -994,6 +994,7 @@ def raise_assert_detail(obj, message, left, right, diff=None): left = repr(left) if PY2 and isinstance(left, string_types): + # left needs to be printable in native text type in python2 left = left.encode('utf-8') if isinstance(right, np.ndarray): @@ -1002,6 +1003,7 @@ def raise_assert_detail(obj, message, left, right, diff=None): right = repr(right) if PY2 and isinstance(right, string_types): + # right needs to be printable in native text type in python2 right = right.encode('utf-8') msg = """{obj} are different From 1f7e231fd329330f4a9dbfe33aeafa10968d0ee7 Mon Sep 17 00:00:00 2001 From: Alejandro 
Giacometti Date: Mon, 9 Apr 2018 18:33:33 +0100 Subject: [PATCH 11/11] Add binary <-> unicode tests --- pandas/tests/util/test_testing.py | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/pandas/tests/util/test_testing.py b/pandas/tests/util/test_testing.py index 4668e1bec43e3..d6f58d16bcf64 100644 --- a/pandas/tests/util/test_testing.py +++ b/pandas/tests/util/test_testing.py @@ -523,6 +523,7 @@ def test_not_equal(self): self._assert_not_equal(Series(range(3)), Series(range(3)) + 1) self._assert_not_equal(Series(list('abc')), Series(list('xyz'))) self._assert_not_equal(Series(list(u'áàä')), Series(list(u'éèë'))) + self._assert_not_equal(Series(list(u'áàä')), Series(list(b'aaa'))) self._assert_not_equal(Series(range(3)), Series(range(4))) self._assert_not_equal( Series(range(3)), Series( @@ -722,6 +723,25 @@ def test_frame_equal_message_unicode(self): 'E': [u'é', u'è', u'e̊']}), by_blocks=True) + expected = """DataFrame\\.iloc\\[:, 0\\] are different + +DataFrame\\.iloc\\[:, 0\\] values are different \\(100\\.0 %\\) +\\[left\\]: \\[á, à, ä\\] +\\[right\\]: \\[a, a, a\\]""" + + with tm.assert_raises_regex(AssertionError, expected): + assert_frame_equal(pd.DataFrame({'A': [u'á', u'à', u'ä'], + 'E': [u'é', u'è', u'ë']}), + pd.DataFrame({'A': ['a', 'a', 'a'], + 'E': ['e', 'e', 'e']})) + + with tm.assert_raises_regex(AssertionError, expected): + assert_frame_equal(pd.DataFrame({'A': [u'á', u'à', u'ä'], + 'E': [u'é', u'è', u'ë']}), + pd.DataFrame({'A': ['a', 'a', 'a'], + 'E': ['e', 'e', 'e']}), + by_blocks=True) + class TestAssertCategoricalEqual(object):