Skip to content

Commit 1366dbf

Browse files
roberthdevries authored and jbrockmendel committed
BUG: Fix segfault on dir of a DataFrame with a unicode surrogate character in the column name (pandas-dev#32701)
1 parent 6dc3ac6 commit 1366dbf

File tree

4 files changed

+16
-1
lines changed

4 files changed

+16
-1
lines changed

doc/source/whatsnew/v1.1.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -405,6 +405,7 @@ Other
405405
- Fixed :func:`pandas.testing.assert_series_equal` to correctly raise if left object is a different subclass with ``check_series_type=True`` (:issue:`32670`).
406406
- :meth:`IntegerArray.astype` now supports ``datetime64`` dtype (:issue:`32538`)
407407
- Fixed bug in :func:`pandas.testing.assert_series_equal` where dtypes were checked for ``Interval`` and ``ExtensionArray`` operands when ``check_dtype`` was ``False`` (:issue:`32747`)
408+
- Bug in :meth:`DataFrame.__dir__` caused a segfault when using unicode surrogates in a column name (:issue:`25509`)
408409

409410
.. ---------------------------------------------------------------------------
410411

pandas/_libs/hashtable_class_helper.pxi.in

+6
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,9 @@ WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in
1212
from pandas._libs.tslibs.util cimport get_c_string
1313
from pandas._libs.missing cimport C_NA
1414

15+
cdef extern from "Python.h":
16+
void PyErr_Clear()
17+
1518
{{py:
1619

1720
# name, dtype, c_type
@@ -790,6 +793,9 @@ cdef class StringHashTable(HashTable):
790793
else:
791794
# if ignore_na is False, we also stringify NaN/None/etc.
792795
v = get_c_string(<str>val)
796+
if v == NULL:
797+
PyErr_Clear()
798+
v = get_c_string(<str>repr(val))
793799
vecs[i] = v
794800

795801
# compute

pandas/tests/frame/test_api.py

+8
Original file line numberDiff line numberDiff line change
@@ -127,6 +127,14 @@ def test_not_hashable(self):
127127
with pytest.raises(TypeError, match=msg):
128128
hash(empty_frame)
129129

130+
def test_column_name_contains_unicode_surrogate(self):
131+
# GH 25509
132+
colname = "\ud83d"
133+
df = DataFrame({colname: []})
134+
# this should not crash
135+
assert colname not in dir(df)
136+
assert df.columns[0] == colname
137+
130138
def test_new_empty_index(self):
131139
df1 = DataFrame(np.random.randn(0, 3))
132140
df2 = DataFrame(np.random.randn(0, 3))

pandas/tests/io/parser/test_dtypes.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -192,7 +192,7 @@ def test_categorical_dtype_utf16(all_parsers, csv_dir_path):
192192
pth = os.path.join(csv_dir_path, "utf16_ex.txt")
193193
parser = all_parsers
194194
encoding = "utf-16"
195-
sep = ","
195+
sep = "\t"
196196

197197
expected = parser.read_csv(pth, sep=sep, encoding=encoding)
198198
expected = expected.apply(Categorical)

0 commit comments

Comments
 (0)