BUG: read_table and read_csv crash (pandas-dev#22748)

troels · troels · commit 6d5ecc7a3f20 · 2018-09-19T01:21:33.000+02:00
A missing null-pointer check made read_table and read_csv prone
to crash on badly encoded text. Add null-pointer check.
diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt
@@ -745,6 +745,7 @@ I/O
 
 - :func:`read_html()` no longer ignores all-whitespace ``<tr>`` within ``<thead>`` when considering the ``skiprows`` and ``header`` arguments. Previously, users had to decrease their ``header`` and ``skiprows`` values on such tables to work around the issue. (:issue:`21641`)
 - :func:`read_excel()` will correctly show the deprecation warning for previously deprecated ``sheetname`` (:issue:`17994`)
+- :func:`read_csv()` and :func:`read_table()` will throw ``UnicodeEncodeError`` and not coredump on badly encoded strings (:issue:`22748`)
 - :func:`read_csv()` will correctly parse timezone-aware datetimes (:issue:`22256`)
 - :func:`read_sas()` will parse numbers in sas7bdat-files that have width less than 8 bytes correctly. (:issue:`21616`)
 - :func:`read_sas()` will correctly parse sas7bdat files with many columns (:issue:`22628`)
diff --git a/pandas/_libs/src/parser/io.c b/pandas/_libs/src/parser/io.c
@@ -150,7 +150,11 @@ void *buffer_rd_bytes(void *source, size_t nbytes, size_t *bytes_read,
         return NULL;
     } else if (!PyBytes_Check(result)) {
         tmp = PyUnicode_AsUTF8String(result);
-        Py_XDECREF(result);
+        Py_DECREF(result);
+        if (tmp == NULL) {
+            PyGILState_Release(state);
+            return NULL;
+        }
         result = tmp;
     }
 
diff --git a/pandas/tests/io/parser/c_parser_only.py b/pandas/tests/io/parser/c_parser_only.py
@@ -18,7 +18,8 @@
 import pandas.util.testing as tm
 import pandas.util._test_decorators as td
 from pandas import DataFrame
-from pandas.compat import StringIO, range, lrange
+from pandas.compat import BytesIO, StringIO, range, lrange
+from io import TextIOWrapper
 
 
 class CParserTests(object):
@@ -55,6 +56,13 @@ def test_buffer_rd_bytes(self):
             except Exception:
                 pass
 
+    def test_buffer_rd_bytes_bad_unicode(self, method):
+        # Regression test for #22748
+        b = BytesIO(b"\xB0")
+        t = TextIOWrapper(b, encoding='ascii', errors='surrogateescape')
+        with pytest.raises(UnicodeEncodeError):
+            pd.read_csv(t)
+
     def test_delim_whitespace_custom_terminator(self):
         # See gh-12912
         data = """a b c~1 2 3~4 5 6~7 8 9"""