From bf5d59e215c5df4d31710d17afaaabf499e22114 Mon Sep 17 00:00:00 2001
From: Robert de Vries <rhdv@xs4all.nl>
Date: Wed, 11 Mar 2020 14:45:29 +0100
Subject: [PATCH 1/6] Fix segfault on dir of a DataFrame with an unicode
 surrogate character in the column name

Return a repr() version if a string is not printable
---
 doc/source/whatsnew/v1.1.0.rst |  1 +
 pandas/_libs/tslibs/util.pxd   | 10 ++++------
 pandas/tests/frame/test_api.py |  8 ++++++++
 3 files changed, 13 insertions(+), 6 deletions(-)

diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst
index 0d3a9a8f969a4..b97dd580e2d8a 100644
--- a/doc/source/whatsnew/v1.1.0.rst
+++ b/doc/source/whatsnew/v1.1.0.rst
@@ -402,6 +402,7 @@ Other
 - Fixed :func:`pandas.testing.assert_series_equal` to correctly raise if left object is a different subclass with ``check_series_type=True`` (:issue:`32670`).
 - :meth:`IntegerArray.astype` now supports ``datetime64`` dtype (:issue:32538`)
 - Fixed bug in :func:`pandas.testing.assert_series_equal` where dtypes were checked for ``Interval`` and ``ExtensionArray`` operands when ``check_dtype`` was ``False`` (:issue:`32747`)
+- Bug in :meth:`DataFrame.__dir__` caused a segfault when using unicode surrogates in a column name (:issue:`25509`)
 
 .. ---------------------------------------------------------------------------
 
diff --git a/pandas/_libs/tslibs/util.pxd b/pandas/_libs/tslibs/util.pxd
index e7f6b3334eb65..0537cd3fb46a6 100644
--- a/pandas/_libs/tslibs/util.pxd
+++ b/pandas/_libs/tslibs/util.pxd
@@ -236,13 +236,11 @@ cdef inline const char* get_c_string_buf_and_size(str py_string,
 
     Returns
     -------
-    buf : const char*
+    c_string_buf : const char*
     """
-    cdef:
-        const char *buf
-
-    buf = PyUnicode_AsUTF8AndSize(py_string, length)
-    return buf
+    if not py_string.isprintable():
+        return PyUnicode_AsUTF8AndSize(repr(py_string), length)
+    return PyUnicode_AsUTF8AndSize(py_string, length)
 
 
 cdef inline const char* get_c_string(str py_string):
diff --git a/pandas/tests/frame/test_api.py b/pandas/tests/frame/test_api.py
index a021dd91a7d26..940a76601b75e 100644
--- a/pandas/tests/frame/test_api.py
+++ b/pandas/tests/frame/test_api.py
@@ -127,6 +127,14 @@ def test_not_hashable(self):
         with pytest.raises(TypeError, match=msg):
             hash(empty_frame)
 
+    def test_column_name_contains_unicode_surrogate(self):
+        # GH 25509
+        colname = "\ud83d"
+        df = DataFrame({colname: []})
+        # this should not crash
+        assert colname not in dir(df)
+        assert df.columns[0] == colname
+
     def test_new_empty_index(self):
         df1 = DataFrame(np.random.randn(0, 3))
         df2 = DataFrame(np.random.randn(0, 3))

From 611cca958b28b80733571261ee4ca82fee96080c Mon Sep 17 00:00:00 2001
From: Robert de Vries <rhdv@xs4all.nl>
Date: Sat, 14 Mar 2020 18:28:23 +0100
Subject: [PATCH 2/6] Fix failing test, the separator in the data is actually a
 tab, not a comma

---
 pandas/tests/io/parser/test_dtypes.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pandas/tests/io/parser/test_dtypes.py b/pandas/tests/io/parser/test_dtypes.py
index 11dcf7f04f76b..e68dcb3aa577e 100644
--- a/pandas/tests/io/parser/test_dtypes.py
+++ b/pandas/tests/io/parser/test_dtypes.py
@@ -192,7 +192,7 @@ def test_categorical_dtype_utf16(all_parsers, csv_dir_path):
     pth = os.path.join(csv_dir_path, "utf16_ex.txt")
     parser = all_parsers
     encoding = "utf-16"
-    sep = ","
+    sep = "\t"
 
     expected = parser.read_csv(pth, sep=sep, encoding=encoding)
     expected = expected.apply(Categorical)

From 6a3e986e20a2fb32e5c71c3c787738970a2e1da8 Mon Sep 17 00:00:00 2001
From: Robert de Vries <rhdv@xs4all.nl>
Date: Sun, 15 Mar 2020 10:53:03 +0100
Subject: [PATCH 3/6] Pushed the fix a bit higher up in the call chain to
 prevent unneeded impact

---
 pandas/_libs/hashtable_class_helper.pxi.in |  2 +-
 pandas/_libs/tslibs/util.pxd               | 10 ++++++----
 2 files changed, 7 insertions(+), 5 deletions(-)

diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in
index 811025a4b5764..5516bc67523bd 100644
--- a/pandas/_libs/hashtable_class_helper.pxi.in
+++ b/pandas/_libs/hashtable_class_helper.pxi.in
@@ -789,7 +789,7 @@ cdef class StringHashTable(HashTable):
                 labels[i] = na_sentinel
             else:
                 # if ignore_na is False, we also stringify NaN/None/etc.
-                v = get_c_string(<str>val)
+                v = get_c_string(<str>val) if val.isprintable() else get_c_string(<str>repr(val))
                 vecs[i] = v
 
         # compute
diff --git a/pandas/_libs/tslibs/util.pxd b/pandas/_libs/tslibs/util.pxd
index 0537cd3fb46a6..e7f6b3334eb65 100644
--- a/pandas/_libs/tslibs/util.pxd
+++ b/pandas/_libs/tslibs/util.pxd
@@ -236,11 +236,13 @@ cdef inline const char* get_c_string_buf_and_size(str py_string,
 
     Returns
     -------
-    c_string_buf : const char*
+    buf : const char*
     """
-    if not py_string.isprintable():
-        return PyUnicode_AsUTF8AndSize(repr(py_string), length)
-    return PyUnicode_AsUTF8AndSize(py_string, length)
+    cdef:
+        const char *buf
+
+    buf = PyUnicode_AsUTF8AndSize(py_string, length)
+    return buf
 
 
 cdef inline const char* get_c_string(str py_string):

From 27da1305a2b22db9ea6cbf67892aeee4190bdfc2 Mon Sep 17 00:00:00 2001
From: Robert de Vries <rhdv@xs4all.nl>
Date: Sun, 15 Mar 2020 12:03:06 +0100
Subject: [PATCH 4/6] Fix linting error

---
 pandas/_libs/hashtable_class_helper.pxi.in | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in
index 5516bc67523bd..232453b03d538 100644
--- a/pandas/_libs/hashtable_class_helper.pxi.in
+++ b/pandas/_libs/hashtable_class_helper.pxi.in
@@ -789,7 +789,8 @@ cdef class StringHashTable(HashTable):
                 labels[i] = na_sentinel
             else:
                 # if ignore_na is False, we also stringify NaN/None/etc.
-                v = get_c_string(<str>val) if val.isprintable() else get_c_string(<str>repr(val))
+                v = get_c_string(<str>val) if val.isprintable() else \
+                    get_c_string(<str>repr(val))
                 vecs[i] = v
 
         # compute

From a52e59cd6e912b8252760f4affd7816ad1590ebf Mon Sep 17 00:00:00 2001
From: Robert de Vries <rhdv@xs4all.nl>
Date: Wed, 18 Mar 2020 21:37:42 +0100
Subject: [PATCH 5/6] Performance fix

---
 pandas/_libs/hashtable_class_helper.pxi.in | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in
index 232453b03d538..1fa672be6f1f9 100644
--- a/pandas/_libs/hashtable_class_helper.pxi.in
+++ b/pandas/_libs/hashtable_class_helper.pxi.in
@@ -12,6 +12,11 @@ WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in
 from pandas._libs.tslibs.util cimport get_c_string
 from pandas._libs.missing cimport C_NA
 
+cdef extern from "Python.h":
+    # Note: importing extern-style allows us to declare these as nogil
+    # functions, whereas `from cpython cimport` does not.
+    void PyErr_Clear() nogil
+
 {{py:
 
 # name, dtype, c_type
@@ -789,8 +794,10 @@ cdef class StringHashTable(HashTable):
                 labels[i] = na_sentinel
             else:
                 # if ignore_na is False, we also stringify NaN/None/etc.
-                v = get_c_string(<str>val) if val.isprintable() else \
-                    get_c_string(<str>repr(val))
+                v = get_c_string(<str>val)
+                if v == NULL:
+                    PyErr_Clear()
+                    v = get_c_string(<str>repr(val))
                 vecs[i] = v
 
         # compute

From b2593bd2110e073056437406b07dc36034367efd Mon Sep 17 00:00:00 2001
From: Robert de Vries <rhdv@xs4all.nl>
Date: Wed, 18 Mar 2020 23:49:17 +0100
Subject: [PATCH 6/6] Remove nogil on PyErr_Clear()

---
 pandas/_libs/hashtable_class_helper.pxi.in | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in
index 1fa672be6f1f9..d662e03304e2e 100644
--- a/pandas/_libs/hashtable_class_helper.pxi.in
+++ b/pandas/_libs/hashtable_class_helper.pxi.in
@@ -13,9 +13,7 @@ from pandas._libs.tslibs.util cimport get_c_string
 from pandas._libs.missing cimport C_NA
 
 cdef extern from "Python.h":
-    # Note: importing extern-style allows us to declare these as nogil
-    # functions, whereas `from cpython cimport` does not.
-    void PyErr_Clear() nogil
+    void PyErr_Clear()
 
 {{py: