BUG: Fix segfault on dir of a DataFrame with a unicode surrogate character in the column name (pandas-dev#32701)

roberthdevries · jbrockmendel · commit 1366dbf8371a · 2020-03-21T18:26:43.000-07:00
diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst
@@ -405,6 +405,7 @@ Other
 - Fixed :func:`pandas.testing.assert_series_equal` to correctly raise if left object is a different subclass with ``check_series_type=True`` (:issue:`32670`).
 - :meth:`IntegerArray.astype` now supports ``datetime64`` dtype (:issue:32538`)
 - Fixed bug in :func:`pandas.testing.assert_series_equal` where dtypes were checked for ``Interval`` and ``ExtensionArray`` operands when ``check_dtype`` was ``False`` (:issue:`32747`)
+- Bug in :meth:`DataFrame.__dir__` caused a segfault when using unicode surrogates in a column name (:issue:`25509`)
 
 .. ---------------------------------------------------------------------------
 
diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in
@@ -12,6 +12,9 @@ WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in
 from pandas._libs.tslibs.util cimport get_c_string
 from pandas._libs.missing cimport C_NA
 
+cdef extern from "Python.h":
+    void PyErr_Clear()
+
 {{py:
 
 # name, dtype, c_type
@@ -790,6 +793,9 @@ cdef class StringHashTable(HashTable):
             else:
                 # if ignore_na is False, we also stringify NaN/None/etc.
                 v = get_c_string(<str>val)
+                if v == NULL:
+                    PyErr_Clear()
+                    v = get_c_string(<str>repr(val))
                 vecs[i] = v
 
         # compute
diff --git a/pandas/tests/frame/test_api.py b/pandas/tests/frame/test_api.py
@@ -127,6 +127,14 @@ def test_not_hashable(self):
         with pytest.raises(TypeError, match=msg):
             hash(empty_frame)
 
+    def test_column_name_contains_unicode_surrogate(self):
+        # GH 25509
+        colname = "\ud83d"
+        df = DataFrame({colname: []})
+        # this should not crash
+        assert colname not in dir(df)
+        assert df.columns[0] == colname
+
     def test_new_empty_index(self):
         df1 = DataFrame(np.random.randn(0, 3))
         df2 = DataFrame(np.random.randn(0, 3))
diff --git a/pandas/tests/io/parser/test_dtypes.py b/pandas/tests/io/parser/test_dtypes.py
@@ -192,7 +192,7 @@ def test_categorical_dtype_utf16(all_parsers, csv_dir_path):
     pth = os.path.join(csv_dir_path, "utf16_ex.txt")
     parser = all_parsers
     encoding = "utf-16"
-    sep = ","
+    sep = "\t"
 
     expected = parser.read_csv(pth, sep=sep, encoding=encoding)
     expected = expected.apply(Categorical)