ERR: csv parser exceptions will now bubble up

bdrosen96 · jreback · commit 210fea9d4dc4 · 2016-07-20T17:53:42.000-04:00
closes #13652 Author: Brett Rosen <brett@datarobot.com> Closes #13693 from bdrosen96/brett/dont_swallow_exc and squashes the following commits: 0efe18b [Brett Rosen] Address review comments 6ed3a2e [Brett Rosen] Flake e966c26 [Brett Rosen] Test case for patch, plus fix to not swallow exceptions
diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt
@@ -309,15 +309,15 @@ Other enhancements
 - A function :func:`union_categorical` has been added for combining categoricals, see :ref:`Unioning Categoricals<categorical.union>` (:issue:`13361`)
 - ``Series`` has gained the properties ``.is_monotonic``, ``.is_monotonic_increasing``, ``.is_monotonic_decreasing``, similar to ``Index`` (:issue:`13336`)
 - ``Series.append`` now supports the ``ignore_index`` option (:issue:`13677`)
-- ``.to_stata()`` and ```StataWriter`` can now write variable labels to Stata dta files using a dictionary to make column names to labels (:issue:`13535`, :issue:`13536`)
+- ``.to_stata()`` and ``StataWriter`` can now write variable labels to Stata dta files using a dictionary to map column names to labels (:issue:`13535`, :issue:`13536`)
 
 .. _whatsnew_0190.api:
 
 API changes
 ~~~~~~~~~~~
 
 
-- ``Index.reshape`` will raise a ``NotImplementedError`` exception when called (:issue: `12882`)
+- ``Index.reshape`` will raise a ``NotImplementedError`` exception when called (:issue:`12882`)
 - Non-convertible dates in an excel date column will be returned without conversion and the column will be ``object`` dtype, rather than raising an exception  (:issue:`10001`)
 - ``eval``'s upcasting rules for ``float32`` types have been updated to be more consistent with NumPy's rules.  New behavior will not upcast to ``float64`` if you multiply a pandas ``float32`` object by a scalar float64. (:issue:`12388`)
 - An ``UnsupportedFunctionCall`` error is now raised if NumPy ufuncs like ``np.mean`` are called on groupby or resample objects (:issue:`12811`)
@@ -330,7 +330,7 @@ API changes
 - ``__setitem__`` will no longer apply a callable rhs as a function instead of storing it. Call ``where`` directly to get the previous behavior. (:issue:`13299`)
 - Passing ``Period`` with multiple frequencies to normal ``Index`` now returns ``Index`` with ``object`` dtype (:issue:`13664`)
 - ``PeriodIndex.fillna`` with ``Period`` has different freq now coerces to ``object`` dtype (:issue:`13664`)
-
+- More informative exceptions are passed through the csv parser. The exception type will now be the original exception type instead of ``CParserError``. (:issue:`13652`)
 
 .. _whatsnew_0190.api.tolist:
 
@@ -595,7 +595,6 @@ Deprecations
 
 Removal of prior version deprecations/changes
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
 - The ``pd.sandbox`` module has been removed in favor of the external library ``pandas-qt`` (:issue:`13670`)
 - ``DataFrame.to_csv()`` has dropped the ``engine`` parameter, as was deprecated in 0.17.1 (:issue:`11274`, :issue:`13419`)
 - ``DataFrame.to_dict()`` has dropped the ``outtype`` parameter in favor of ``orient`` (:issue:`13627`, :issue:`8486`)
@@ -689,8 +688,8 @@ Bug Fixes
 - Bug in ``pd.read_csv()`` with ``engine='python'`` when reading from a ``tempfile.TemporaryFile`` on Windows with Python 3 (:issue:`13398`)
 - Bug in ``pd.read_csv()`` that prevents ``usecols`` kwarg from accepting single-byte unicode strings (:issue:`13219`)
 - Bug in ``pd.read_csv()`` that prevents ``usecols`` from being an empty set (:issue:`13402`)
-- Bug in ``pd.read_csv()`` with ``engine=='c'`` in which null ``quotechar`` was not accepted even though ``quoting`` was specified as ``None`` (:issue:`13411`)
-- Bug in ``pd.read_csv()`` with ``engine=='c'`` in which fields were not properly cast to float when quoting was specified as non-numeric (:issue:`13411`)
+- Bug in ``pd.read_csv()`` with ``engine='c'`` in which null ``quotechar`` was not accepted even though ``quoting`` was specified as ``None`` (:issue:`13411`)
+- Bug in ``pd.read_csv()`` with ``engine='c'`` in which fields were not properly cast to float when quoting was specified as non-numeric (:issue:`13411`)
 - Bug in ``pd.pivot_table()`` where ``margins_name`` is ignored when ``aggfunc`` is a list (:issue:`13354`)
 - Bug in ``pd.Series.str.zfill``, ``center``, ``ljust``, ``rjust``, and ``pad`` when passing non-integers, did not raise ``TypeError`` (:issue:`13598`)
 - Bug in checking for any null objects in a ``TimedeltaIndex``, which always returned ``True`` (:issue:`13603`)
diff --git a/pandas/io/tests/parser/common.py b/pandas/io/tests/parser/common.py
@@ -3,6 +3,7 @@
 import csv
 import os
 import platform
+import codecs
 
 import re
 import sys
@@ -45,6 +46,27 @@ def test_empty_decimal_marker(self):
         with tm.assertRaisesRegexp(ValueError, msg):
             self.read_csv(StringIO(data), decimal='')
 
+    def test_bad_stream_exception(self):
+        # Issue 13652:
+        # This test validates that both the python engine
+        # and the C engine raise UnicodeDecodeError, instead of
+        # the C engine raising CParserError and swallowing the
+        # exception that caused the read to fail.
+        codec = codecs.lookup("utf-8")
+        if compat.PY3:
+            msg = "'utf-8' codec can't decode byte"
+        else:
+            msg = "'utf8' codec can't decode byte"
+        # The context manager closes the file even when the assertion
+        # fails, so a failing test does not leak the handle.
+        with open(self.csv_shiftjs, "rb") as handle:
+            # stream must be binary UTF8
+            stream = codecs.StreamRecoder(
+                handle, codec.encode, codec.decode, codec.streamreader,
+                codec.streamwriter)
+            with tm.assertRaisesRegexp(UnicodeDecodeError, msg):
+                self.read_csv(stream)
+
     def test_read_csv(self):
         if not compat.PY3:
             if compat.is_platform_windows():
diff --git a/pandas/io/tests/parser/data/sauron.SHIFT_JIS.csv b/pandas/io/tests/parser/data/sauron.SHIFT_JIS.csv
@@ -0,0 +1,14 @@
+num, text
+1,�T�E�����iSauron�A�A�C�k�A�̑n���̎� - ��O�I3019�N3��25���j�́AJ�ER�ER�E�g�[���L���̒����𕑑�Ƃ��������w�z�r�b�g�̖`���x�w�w�֕���x�w�V���}�����̕���x�̓o��l���B
+2,�w�z�r�b�g�̖`���x�Ɍ��y�̂���u���l����Ȃ��t�v�i�f��w�z�r�b�g�V���[�Y�x�̎����ł́u���l�����i�l�N���}���T�[�j�v�j�Ƃ͔ނ̂��Ƃł���B
+3,���̑��҂ł���w�w�֕���x�ɂ����Ắu��̎w�ցithe One Ring�j�v�̍���A�u�����iDark Lord�j�v�A�u���̎ҁithe One�j[1]�v�Ƃ��ēo�ꂷ��B�O�j�ɂ�����w�V���}�����̕���x�ł́A����̖��������S�X�̍ł��͂��鑤�߂ł������B
+4,�T�E�����͌����A�A���_�i�n���j�̑n����S�����V�g�I�푰�A�C�k�A�̈���ł��������A�僁���R�[���̔��t�ɉ��S���đ����A�A���_�ɊQ���Ȃ����݂ƂȂ����B
+5,�u�T�E�����v�Ƃ̓N�E�F�����Łu�g�̖т̂悾���́v�Ƃ����Ӗ��ł���A�V���_�����œ��l�̈Ӗ��ł��閼�O�u�S���T�E�A�v�ƌĂ΂�邱�Ƃ�����B
+6,�����́A�T�E����������A���݌������G���t�ɂ�閼�ł���A�w�w�֕���x�쒆�ɂ����ăA���S�����́u����i�T�E�����j�͎����̖{���̖��͎g��Ȃ����A��������ɏ���������ɏo�����肷�邱�Ƃ������Ȃ��v�Ɣ������Ă���B
+7,���̂ق��A���I�ɃG���t�ɑ΂��Ď��̂����Ƃ���閼�ɁA�u�A���i�^�[���i������N�j�v�A�u�A���^�m�i���M�ȍ׍H�t�j�v�A�u�A�E�����f�B���i�A�E���̉��l�j�v������B
+8,���I�̍��̃T�E�����́A���݂ɕϐg����\�͂������Ă����B
+9,���̔\�͂��g���Ό��ڗ킵�����h�ȊO���𑕂����Ƃ�A�܂�����ȘT��z����������Ƃ����������ɕς��邱�Ƃ��ł��A�G���t���狰���ꂽ�B
+10,���I�Ɉ�̎w�ւ����グ���T�E�����́A���̗͂̎w�ւŐ�����鎖���₻�̏��L�҂��x�z�ł���悤�ɂȂ����B
+11,�܂��A���̂��łтĂ��w�ւ�������艽�x�ł��h�邱�Ƃ��ł����B
+12,�������k�[���m�[���v���̍ۂɔ��������̂�j�󂳂ꂽ��́A��x�Ɣ������ϐg���邱�Ƃ͂ł��Ȃ��Ȃ�A���̈��ӂ̋�̂悤�Ȍ�������낵���p�����Ƃ�Ȃ��Ȃ����Ƃ����B
+13,�܂����΂��΁u�܂Ԃ��̂Ȃ��΂ɉ����ꂽ�ځv�Ƃ������S�ە\���ő�����ꂽ�B
diff --git a/pandas/io/tests/parser/test_parsers.py b/pandas/io/tests/parser/test_parsers.py
@@ -44,6 +44,7 @@ def setUp(self):
         self.csv1 = os.path.join(self.dirpath, 'test1.csv')
         self.csv2 = os.path.join(self.dirpath, 'test2.csv')
         self.xls1 = os.path.join(self.dirpath, 'test.xls')
+        self.csv_shiftjs = os.path.join(self.dirpath, 'sauron.SHIFT_JIS.csv')
 
 
 class TestCParserHighMemory(BaseParser, CParserTests, tm.TestCase):
diff --git a/pandas/parser.pyx b/pandas/parser.pyx
@@ -10,7 +10,9 @@ import warnings
 from csv import QUOTE_MINIMAL, QUOTE_NONNUMERIC, QUOTE_NONE
 from cpython cimport (PyObject, PyBytes_FromString,
                       PyBytes_AsString, PyBytes_Check,
-                      PyUnicode_Check, PyUnicode_AsUTF8String)
+                      PyUnicode_Check, PyUnicode_AsUTF8String,
+                      PyErr_Occurred, PyErr_Fetch)
+from cpython.ref cimport PyObject, Py_XDECREF
 from io.common import CParserError, DtypeWarning, EmptyDataError
 
 
@@ -1878,6 +1880,20 @@ cdef kh_float64_t* kset_float64_from_list(values) except NULL:
 
 
 cdef raise_parser_error(object base, parser_t *parser):
+    cdef:
+        object old_exc
+        PyObject *type
+        PyObject *value
+        PyObject *traceback
+
+    if PyErr_Occurred():
+        PyErr_Fetch(&type, &value, &traceback);
+        Py_XDECREF(type)
+        Py_XDECREF(traceback)
+        if value != NULL:
+            old_exc = <object> value
+            Py_XDECREF(value)
+            raise old_exc
     message = '%s. C error: ' % base
     if parser.error_msg != NULL:
         if PY3: