From bf5d59e215c5df4d31710d17afaaabf499e22114 Mon Sep 17 00:00:00 2001 From: Robert de Vries Date: Wed, 11 Mar 2020 14:45:29 +0100 Subject: [PATCH 1/6] Fix segfault on dir of a DataFrame with an unicode surrogate character in the column name Return a repr() version if a string is not printable --- doc/source/whatsnew/v1.1.0.rst | 1 + pandas/_libs/tslibs/util.pxd | 10 ++++------ pandas/tests/frame/test_api.py | 8 ++++++++ 3 files changed, 13 insertions(+), 6 deletions(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 0d3a9a8f969a4..b97dd580e2d8a 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -402,6 +402,7 @@ Other - Fixed :func:`pandas.testing.assert_series_equal` to correctly raise if left object is a different subclass with ``check_series_type=True`` (:issue:`32670`). - :meth:`IntegerArray.astype` now supports ``datetime64`` dtype (:issue:32538`) - Fixed bug in :func:`pandas.testing.assert_series_equal` where dtypes were checked for ``Interval`` and ``ExtensionArray`` operands when ``check_dtype`` was ``False`` (:issue:`32747`) +- Bug in :meth:`DataFrame.__dir__` caused a segfault when using unicode surrogates in a column name (:issue:`25509`) .. --------------------------------------------------------------------------- diff --git a/pandas/_libs/tslibs/util.pxd b/pandas/_libs/tslibs/util.pxd index e7f6b3334eb65..0537cd3fb46a6 100644 --- a/pandas/_libs/tslibs/util.pxd +++ b/pandas/_libs/tslibs/util.pxd @@ -236,13 +236,11 @@ cdef inline const char* get_c_string_buf_and_size(str py_string, Returns ------- - buf : const char* + c_string_buf : const char* """ - cdef: - const char *buf - - buf = PyUnicode_AsUTF8AndSize(py_string, length) - return buf + if not py_string.isprintable(): + return PyUnicode_AsUTF8AndSize(repr(py_string), length) + return PyUnicode_AsUTF8AndSize(py_string, length) cdef inline const char* get_c_string(str py_string): diff --git a/pandas/tests/frame/test_api.py b/pandas/tests/frame/test_api.py index a021dd91a7d26..940a76601b75e 100644 --- a/pandas/tests/frame/test_api.py +++ b/pandas/tests/frame/test_api.py @@ -127,6 +127,14 @@ def test_not_hashable(self): with pytest.raises(TypeError, match=msg): hash(empty_frame) + def test_column_name_contains_unicode_surrogate(self): + # GH 25509 + colname = "\ud83d" + df = DataFrame({colname: []}) + # this should not crash + assert colname not in dir(df) + assert df.columns[0] == colname + def test_new_empty_index(self): df1 = DataFrame(np.random.randn(0, 3)) df2 = DataFrame(np.random.randn(0, 3)) From 611cca958b28b80733571261ee4ca82fee96080c Mon Sep 17 00:00:00 2001 From: Robert de Vries Date: Sat, 14 Mar 2020 18:28:23 +0100 Subject: [PATCH 2/6] Fix failing test, the separator in the data is actually a tab, not a comma --- pandas/tests/io/parser/test_dtypes.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/io/parser/test_dtypes.py b/pandas/tests/io/parser/test_dtypes.py index 11dcf7f04f76b..e68dcb3aa577e 100644 --- a/pandas/tests/io/parser/test_dtypes.py +++ b/pandas/tests/io/parser/test_dtypes.py @@ -192,7 +192,7 @@ def test_categorical_dtype_utf16(all_parsers, csv_dir_path): pth = os.path.join(csv_dir_path, "utf16_ex.txt") parser = all_parsers encoding = "utf-16" - sep = "," + sep = "\t" expected = parser.read_csv(pth, sep=sep, encoding=encoding) expected = expected.apply(Categorical) From 6a3e986e20a2fb32e5c71c3c787738970a2e1da8 Mon Sep 17 00:00:00 2001 From: Robert de Vries Date: Sun, 15 Mar 2020 10:53:03 +0100 Subject: [PATCH 3/6] Pushed the fix a bit higher up in the call chain to prevent unneeded impact --- pandas/_libs/hashtable_class_helper.pxi.in | 2 +- pandas/_libs/tslibs/util.pxd | 10 ++++++---- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index 811025a4b5764..5516bc67523bd 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -789,7 +789,7 @@ cdef class StringHashTable(HashTable): labels[i] = na_sentinel else: # if ignore_na is False, we also stringify NaN/None/etc. - v = get_c_string(val) + v = get_c_string(val) if val.isprintable() else get_c_string(repr(val)) vecs[i] = v # compute diff --git a/pandas/_libs/tslibs/util.pxd b/pandas/_libs/tslibs/util.pxd index 0537cd3fb46a6..e7f6b3334eb65 100644 --- a/pandas/_libs/tslibs/util.pxd +++ b/pandas/_libs/tslibs/util.pxd @@ -236,11 +236,13 @@ cdef inline const char* get_c_string_buf_and_size(str py_string, Returns ------- - c_string_buf : const char* + buf : const char* """ - if not py_string.isprintable(): - return PyUnicode_AsUTF8AndSize(repr(py_string), length) - return PyUnicode_AsUTF8AndSize(py_string, length) + cdef: + const char *buf + + buf = PyUnicode_AsUTF8AndSize(py_string, length) + return buf cdef inline const char* get_c_string(str py_string): From 27da1305a2b22db9ea6cbf67892aeee4190bdfc2 Mon Sep 17 00:00:00 2001 From: Robert de Vries Date: Sun, 15 Mar 2020 12:03:06 +0100 Subject: [PATCH 4/6] Fix linting error --- pandas/_libs/hashtable_class_helper.pxi.in | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index 5516bc67523bd..232453b03d538 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -789,7 +789,8 @@ cdef class StringHashTable(HashTable): labels[i] = na_sentinel else: # if ignore_na is False, we also stringify NaN/None/etc. - v = get_c_string(val) if val.isprintable() else get_c_string(repr(val)) + v = get_c_string(val) if val.isprintable() else \ + get_c_string(repr(val)) vecs[i] = v # compute From a52e59cd6e912b8252760f4affd7816ad1590ebf Mon Sep 17 00:00:00 2001 From: Robert de Vries Date: Wed, 18 Mar 2020 21:37:42 +0100 Subject: [PATCH 5/6] Performance fix --- pandas/_libs/hashtable_class_helper.pxi.in | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index 232453b03d538..1fa672be6f1f9 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -12,6 +12,11 @@ WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in from pandas._libs.tslibs.util cimport get_c_string from pandas._libs.missing cimport C_NA +cdef extern from "Python.h": + # Note: importing extern-style allows us to declare these as nogil + # functions, whereas `from cpython cimport` does not. + void PyErr_Clear() nogil + {{py: # name, dtype, c_type @@ -789,8 +794,10 @@ cdef class StringHashTable(HashTable): labels[i] = na_sentinel else: # if ignore_na is False, we also stringify NaN/None/etc. - v = get_c_string(val) if val.isprintable() else \ - get_c_string(repr(val)) + v = get_c_string(val) + if v == NULL: + PyErr_Clear() + v = get_c_string(repr(val)) vecs[i] = v # compute From b2593bd2110e073056437406b07dc36034367efd Mon Sep 17 00:00:00 2001 From: Robert de Vries Date: Wed, 18 Mar 2020 23:49:17 +0100 Subject: [PATCH 6/6] Remove nogil on PyErr_Clear() --- pandas/_libs/hashtable_class_helper.pxi.in | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index 1fa672be6f1f9..d662e03304e2e 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -13,9 +13,7 @@ from pandas._libs.tslibs.util cimport get_c_string from pandas._libs.missing cimport C_NA cdef extern from "Python.h": - # Note: importing extern-style allows us to declare these as nogil - # functions, whereas `from cpython cimport` does not. - void PyErr_Clear() nogil + void PyErr_Clear() {{py: