BUG: Add extra check for failing UTF-8 conversion (#32548)

roberthdevries · web-flow · commit 1b76440e1c9b · 2020-03-11T17:54:06.000-07:00
diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst
@@ -305,6 +305,7 @@ I/O
   timestamps with ``version="2.0"`` (:issue:`31652`).
 - Bug in :meth:`read_csv` was raising `TypeError` when `sep=None` was used in combination with `comment` keyword (:issue:`31396`)
 - Bug in :class:`HDFStore` that caused it to set to ``int64`` the dtype of a ``datetime64`` column when reading a DataFrame in Python 3 from fixed format written in Python 2 (:issue:`31750`)
+- Bug in :meth:`read_excel` where a string containing a lone surrogate code point, which cannot be encoded as UTF-8, would cause a segmentation fault (:issue:`23809`)
 
 
 Plotting
diff --git a/pandas/_libs/src/parse_helper.h b/pandas/_libs/src/parse_helper.h
@@ -34,6 +34,9 @@ int floatify(PyObject *str, double *result, int *maybe_int) {
         data = PyBytes_AS_STRING(str);
     } else if (PyUnicode_Check(str)) {
         tmp = PyUnicode_AsUTF8String(str);
+        if (tmp == NULL) {
+            return -1;
+        }
         data = PyBytes_AS_STRING(tmp);
     } else {
         PyErr_SetString(PyExc_TypeError, "Invalid object type");
diff --git a/pandas/tests/io/data/excel/high_surrogate.xlsx b/pandas/tests/io/data/excel/high_surrogate.xlsx
diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py
@@ -1044,3 +1044,11 @@ def test_excel_read_binary(self, engine, read_ext):
 
         actual = pd.read_excel(data, engine=engine)
         tm.assert_frame_equal(expected, actual)
+
+    def test_excel_high_surrogate(self, engine):
+        # GH 23809
+        expected = pd.DataFrame(["\udc88"], columns=["Column1"])
+
+        # should not produce a segmentation violation
+        actual = pd.read_excel("high_surrogate.xlsx")
+        tm.assert_frame_equal(expected, actual)