Fix segfault on dir() of a DataFrame with a unicode surrogate character in the column name

roberthdevries · roberthdevries · commit ba1a9798aa96 · 2020-03-14T23:28:32.000+01:00
Return a repr() version if a string is not printable
diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst
@@ -389,6 +389,7 @@ Other
 - Set operations on an object-dtype :class:`Index` now always return object-dtype results (:issue:`31401`)
 - Bug in :meth:`AbstractHolidayCalendar.holidays` when no rules were defined (:issue:`31415`)
 - Bug in :meth:`DataFrame.to_records` incorrectly losing timezone information in timezone-aware ``datetime64`` columns (:issue:`32535`)
+- Bug in :meth:`DataFrame.__dir__` caused a segfault when using unicode surrogates in a column name (:issue:`25509`)
 
 .. ---------------------------------------------------------------------------
 
diff --git a/pandas/_libs/tslibs/util.pxd b/pandas/_libs/tslibs/util.pxd
@@ -236,13 +236,11 @@ cdef inline const char* get_c_string_buf_and_size(str py_string,
 
     Returns
     -------
-    buf : const char*
+    c_string_buf : const char*
     """
-    cdef:
-        const char *buf
-
-    buf = PyUnicode_AsUTF8AndSize(py_string, length)
-    return buf
+    if not py_string.isprintable():
+        return PyUnicode_AsUTF8AndSize(repr(py_string), length)
+    return PyUnicode_AsUTF8AndSize(py_string, length)
 
 
 cdef inline const char* get_c_string(str py_string):
diff --git a/pandas/tests/frame/test_api.py b/pandas/tests/frame/test_api.py
@@ -127,6 +127,14 @@ def test_not_hashable(self):
         with pytest.raises(TypeError, match=msg):
             hash(empty_frame)
 
+    def test_column_name_contains_unicode_surrogate(self):
+        # GH 25509
+        colname = "\ud83d"
+        df = DataFrame({colname: []})
+        # this should not crash
+        assert colname not in dir(df)
+        assert df.columns[0] == colname
+
     def test_new_empty_index(self):
         df1 = DataFrame(np.random.randn(0, 3))
         df2 = DataFrame(np.random.randn(0, 3))