From e966c261caf42e25852b9210b55b35d287288aa2 Mon Sep 17 00:00:00 2001 From: Brett Rosen Date: Mon, 18 Jul 2016 10:42:06 -0400 Subject: [PATCH 1/3] Test case for patch, plus fix to not swallow exceptions --- pandas/io/tests/parser/common.py | 15 +++++++++++++++ pandas/io/tests/parser/data/sauron.SHIFT_JIS.csv | 14 ++++++++++++++ pandas/io/tests/parser/test_parsers.py | 1 + pandas/parser.pyx | 15 ++++++++++++++- 4 files changed, 44 insertions(+), 1 deletion(-) create mode 100644 pandas/io/tests/parser/data/sauron.SHIFT_JIS.csv diff --git a/pandas/io/tests/parser/common.py b/pandas/io/tests/parser/common.py index 670f3df6f3984..421273bea5e8a 100644 --- a/pandas/io/tests/parser/common.py +++ b/pandas/io/tests/parser/common.py @@ -3,6 +3,7 @@ import csv import os import platform +import codecs import re import sys @@ -45,6 +46,20 @@ def test_empty_decimal_marker(self): with tm.assertRaisesRegexp(ValueError, msg): self.read_csv(StringIO(data), decimal='') + def test_bad_stream_exception(self): + handle = open(self.csv_shiftjs, "rb") + codec = codecs.lookup("utf-8") + utf8 = codecs.lookup('utf-8') + # stream must be binary UTF8 + stream = codecs.StreamRecoder( + handle, utf8.encode, utf8.decode, codec.streamreader, codec.streamwriter) + if compat.PY3: + msg = "'utf-8' codec can't decode byte" + else: + msg = "'utf8' codec can't decode byte" + with tm.assertRaisesRegexp(UnicodeDecodeError, msg): + self.read_csv(stream) + def test_read_csv(self): if not compat.PY3: if compat.is_platform_windows(): diff --git a/pandas/io/tests/parser/data/sauron.SHIFT_JIS.csv b/pandas/io/tests/parser/data/sauron.SHIFT_JIS.csv new file mode 100644 index 0000000000000..218ddf333ef52 --- /dev/null +++ b/pandas/io/tests/parser/data/sauron.SHIFT_JIS.csv @@ -0,0 +1,14 @@ +num, text +1,�T�E�����iSauron�A�A�C�k�A�̑n���̎� - ��O�I3019�N3��25���j�́AJ�ER�ER�E�g�[���L���̒����𕑑�Ƃ��������w�z�r�b�g�̖`���x�w�w�֕���x�w�V���}�����̕���x�̓o��l���B 
+2,�w�z�r�b�g�̖`���x�Ɍ��y�̂���u���l����Ȃ��t�v�i�f��w�z�r�b�g�V���[�Y�x�̎����ł́u���l�����i�l�N���}���T�[�j�v�j�Ƃ͔ނ̂��Ƃł���B +3,���̑��҂ł���w�w�֕���x�ɂ����Ắu��‚̎w�ցithe One Ring�j�v�̍���A�u�����iDark Lord�j�v�A�u���̎ҁithe One�j[1]�v�Ƃ��ēo�ꂷ��B�O�j�ɂ�����w�V���}�����̕���x�ł́A����̖��������S�X�̍ł��͂��鑤�߂ł������B +4,�T�E�����͌����A�A���_�i�n���j�̑n����S�����V�g�I�푰�A�C�k�A�̈���ł��������A�僁���R�[���̔��t�ɉ��S���đ—����A�A���_�ɊQ���Ȃ����݂ƂȂ����B +5,�u�T�E�����v�Ƃ̓N�E�F�����Łu�g�̖т̂悾�‚��́v�Ƃ����Ӗ��ł���A�V���_�����œ��l�̈Ӗ��ł��閼�O�u�S���T�E�A�v�ƌĂ΂�邱�Ƃ�����B +6,�����́A�T�E����������A���݌������G���t�ɂ�閼�ł���A�w�w�֕���x�쒆�ɂ����ăA���S�����́u����i�T�E�����j�͎����̖{���̖��͎g��Ȃ����A��������ɏ���������ɏo�����肷�邱�Ƃ������Ȃ��v�Ɣ������Ă���B +7,���̂ق��A���I�ɃG���t�ɑ΂��Ď��̂����Ƃ���閼�ɁA�u�A���i�^�[���i������N�j�v�A�u�A���^�m�i���M�ȍ׍H�t�j�v�A�u�A�E�����f�B���i�A�E���̉��l�j�v������B +8,���I�̍��̃T�E�����́A���݂ɕϐg����\�͂������Ă����B +9,���̔\�͂��g���Ό��ڗ킵�����h�ȊO���𑕂����Ƃ�A�܂�����ȘT��z����������Ƃ����������ɕς��邱�Ƃ��ł��A�G���t���狰���ꂽ�B +10,���I�Ɉ�‚̎w�ւ����グ���T�E�����́A���̗͂̎w�ւŐ�����鎖���₻�̏��L�҂��x�z�ł���悤�ɂȂ����B +11,�܂��A���̂��łтĂ��w�ւ�������艽�x�ł��h�邱�Ƃ��ł����B +12,�������k�[���m�[���v���̍ۂɔ��������̂�j�󂳂ꂽ��́A��x�Ɣ������ϐg���邱�Ƃ͂ł��Ȃ��Ȃ�A���̈��ӂ̋�̂悤�Ȍ�������낵���p�����Ƃ�Ȃ��Ȃ����Ƃ����B +13,�܂����΂��΁u�܂Ԃ��̂Ȃ��΂ɉ����ꂽ�ځv�Ƃ������S�ە\���ő�����ꂽ�B diff --git a/pandas/io/tests/parser/test_parsers.py b/pandas/io/tests/parser/test_parsers.py index 21f903342a611..6001c85ae76b1 100644 --- a/pandas/io/tests/parser/test_parsers.py +++ b/pandas/io/tests/parser/test_parsers.py @@ -44,6 +44,7 @@ def setUp(self): self.csv1 = os.path.join(self.dirpath, 'test1.csv') self.csv2 = os.path.join(self.dirpath, 'test2.csv') self.xls1 = os.path.join(self.dirpath, 'test.xls') + self.csv_shiftjs = os.path.join(self.dirpath, 'sauron.SHIFT_JIS.csv') class TestCParserHighMemory(BaseParser, CParserTests, tm.TestCase): diff --git a/pandas/parser.pyx b/pandas/parser.pyx index 3928bc8472113..61a1e038b89ce 100644 --- 
a/pandas/parser.pyx +++ b/pandas/parser.pyx @@ -10,7 +10,9 @@ import warnings from csv import QUOTE_MINIMAL, QUOTE_NONNUMERIC, QUOTE_NONE from cpython cimport (PyObject, PyBytes_FromString, PyBytes_AsString, PyBytes_Check, - PyUnicode_Check, PyUnicode_AsUTF8String) + PyUnicode_Check, PyUnicode_AsUTF8String, + PyErr_Occurred, PyErr_Fetch) +from cpython.ref cimport PyObject, Py_XDECREF from io.common import CParserError, DtypeWarning, EmptyDataError @@ -1878,6 +1880,17 @@ cdef kh_float64_t* kset_float64_from_list(values) except NULL: cdef raise_parser_error(object base, parser_t *parser): + cdef: + object old_exc + PyObject *type, *value, *traceback + if PyErr_Occurred(): + PyErr_Fetch(&type, &value, &traceback); + Py_XDECREF(type) + Py_XDECREF(traceback) + if value != NULL: + old_exc = <object> value + Py_XDECREF(value) + raise old_exc message = '%s. C error: ' % base if parser.error_msg != NULL: if PY3: From 6ed3a2e2aa60730de3378902c5c2ad18ae03569b Mon Sep 17 00:00:00 2001 From: Brett Rosen Date: Mon, 18 Jul 2016 10:43:38 -0400 Subject: [PATCH 2/3] Flake --- pandas/io/tests/parser/common.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/io/tests/parser/common.py b/pandas/io/tests/parser/common.py index 421273bea5e8a..2080305209ee9 100644 --- a/pandas/io/tests/parser/common.py +++ b/pandas/io/tests/parser/common.py @@ -52,7 +52,8 @@ def test_bad_stream_exception(self): utf8 = codecs.lookup('utf-8') # stream must be binary UTF8 stream = codecs.StreamRecoder( - handle, utf8.encode, utf8.decode, codec.streamreader, codec.streamwriter) + handle, utf8.encode, utf8.decode, codec.streamreader, + codec.streamwriter) if compat.PY3: msg = "'utf-8' codec can't decode byte" else: From 0efe18b845f3c6551d73fb1f6e1b3766709f000e Mon Sep 17 00:00:00 2001 From: Brett Rosen Date: Tue, 19 Jul 2016 07:08:47 -0400 Subject: [PATCH 3/3] Address review comments --- doc/source/whatsnew/v0.19.0.txt | 2 +- pandas/io/tests/parser/common.py | 6 ++++++ 2 files changed, 7 
insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index f65f7d57d5d08..068207936818c 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -271,7 +271,7 @@ API changes - ``__setitem__`` will no longer apply a callable rhs as a function instead of storing it. Call ``where`` directly to get the previous behavior. (:issue:`13299`) - Passing ``Period`` with multiple frequencies to normal ``Index`` now returns ``Index`` with ``object`` dtype (:issue:`13664`) - ``PeriodIndex.fillna`` with ``Period`` has different freq now coerces to ``object`` dtype (:issue:`13664`) - +- More informative exceptions are passed through the parser. The exception type would now be the original exception type instead of ``CParserError``. (:issue:`13652`) .. _whatsnew_0190.api.tolist: diff --git a/pandas/io/tests/parser/common.py b/pandas/io/tests/parser/common.py index 2080305209ee9..11eed79e03267 100644 --- a/pandas/io/tests/parser/common.py +++ b/pandas/io/tests/parser/common.py @@ -47,6 +47,11 @@ def test_empty_decimal_marker(self): self.read_csv(StringIO(data), decimal='') def test_bad_stream_exception(self): + # Issue 13652: + # This test validates that both python engine + # and C engine will raise UnicodeDecodeError instead of + # c engine raising CParserError and swallowing exception + # that caused read to fail. handle = open(self.csv_shiftjs, "rb") codec = codecs.lookup("utf-8") utf8 = codecs.lookup('utf-8') @@ -60,6 +65,7 @@ def test_bad_stream_exception(self): msg = "'utf8' codec can't decode byte" with tm.assertRaisesRegexp(UnicodeDecodeError, msg): self.read_csv(stream) + stream.close() def test_read_csv(self): if not compat.PY3: