diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 0d3a9a8f969a4..b97dd580e2d8a 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -402,6 +402,7 @@ Other - Fixed :func:`pandas.testing.assert_series_equal` to correctly raise if left object is a different subclass with ``check_series_type=True`` (:issue:`32670`). - :meth:`IntegerArray.astype` now supports ``datetime64`` dtype (:issue:32538`) - Fixed bug in :func:`pandas.testing.assert_series_equal` where dtypes were checked for ``Interval`` and ``ExtensionArray`` operands when ``check_dtype`` was ``False`` (:issue:`32747`) +- Bug in :meth:`DataFrame.__dir__` caused a segfault when using unicode surrogates in a column name (:issue:`25509`) .. --------------------------------------------------------------------------- diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index 811025a4b5764..d662e03304e2e 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -12,6 +12,9 @@ WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in from pandas._libs.tslibs.util cimport get_c_string from pandas._libs.missing cimport C_NA +cdef extern from "Python.h": + void PyErr_Clear() + {{py: # name, dtype, c_type @@ -790,6 +793,9 @@ cdef class StringHashTable(HashTable): else: # if ignore_na is False, we also stringify NaN/None/etc. v = get_c_string(val) + if v == NULL: + PyErr_Clear() + v = get_c_string(repr(val)) vecs[i] = v # compute diff --git a/pandas/tests/frame/test_api.py b/pandas/tests/frame/test_api.py index a021dd91a7d26..940a76601b75e 100644 --- a/pandas/tests/frame/test_api.py +++ b/pandas/tests/frame/test_api.py @@ -127,6 +127,14 @@ def test_not_hashable(self): with pytest.raises(TypeError, match=msg): hash(empty_frame) + def test_column_name_contains_unicode_surrogate(self): + # GH 25509 + colname = "\ud83d" + df = DataFrame({colname: []}) + # this should not crash + assert colname not in dir(df) + assert df.columns[0] == colname + def test_new_empty_index(self): df1 = DataFrame(np.random.randn(0, 3)) df2 = DataFrame(np.random.randn(0, 3)) diff --git a/pandas/tests/io/parser/test_dtypes.py b/pandas/tests/io/parser/test_dtypes.py index 11dcf7f04f76b..e68dcb3aa577e 100644 --- a/pandas/tests/io/parser/test_dtypes.py +++ b/pandas/tests/io/parser/test_dtypes.py @@ -192,7 +192,7 @@ def test_categorical_dtype_utf16(all_parsers, csv_dir_path): pth = os.path.join(csv_dir_path, "utf16_ex.txt") parser = all_parsers encoding = "utf-16" - sep = "," + sep = "\t" expected = parser.read_csv(pth, sep=sep, encoding=encoding) expected = expected.apply(Categorical)