Skip to content

BUG: strengthen typing in get_c_string, fix StringHashTable segfault #30419

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 5 commits into from
Dec 24, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions pandas/_libs/hashtable.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -36,8 +36,8 @@ cdef class PyObjectHashTable(HashTable):
cdef class StringHashTable(HashTable):
cdef kh_str_t *table

cpdef get_item(self, object val)
cpdef set_item(self, object key, Py_ssize_t val)
cpdef get_item(self, str val)
cpdef set_item(self, str key, Py_ssize_t val)

cdef struct Int64VectorData:
int64_t *data
Expand Down
10 changes: 5 additions & 5 deletions pandas/_libs/hashtable_class_helper.pxi.in
Original file line number Diff line number Diff line change
Expand Up @@ -599,7 +599,7 @@ cdef class StringHashTable(HashTable):
sizeof(Py_ssize_t) + # vals
sizeof(uint32_t)) # flags

cpdef get_item(self, object val):
cpdef get_item(self, str val):
cdef:
khiter_t k
const char *v
Expand All @@ -611,16 +611,16 @@ cdef class StringHashTable(HashTable):
else:
raise KeyError(val)

cpdef set_item(self, object key, Py_ssize_t val):
cpdef set_item(self, str key, Py_ssize_t val):
cdef:
khiter_t k
int ret = 0
const char *v

v = get_c_string(val)
v = get_c_string(key)

k = kh_put_str(self.table, v, &ret)
self.table.keys[k] = key
self.table.keys[k] = v
if kh_exist_str(self.table, k):
self.table.vals[k] = val
else:
Expand Down Expand Up @@ -784,7 +784,7 @@ cdef class StringHashTable(HashTable):
labels[i] = na_sentinel
else:
# if ignore_na is False, we also stringify NaN/None/etc.
v = get_c_string(val)
v = get_c_string(<str>val)
vecs[i] = v

# compute
Expand Down
11 changes: 4 additions & 7 deletions pandas/_libs/tslibs/util.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -218,7 +218,7 @@ cdef inline bint is_nan(object val):
return is_complex_object(val) and val != val


cdef inline const char* get_c_string_buf_and_size(object py_string,
cdef inline const char* get_c_string_buf_and_size(str py_string,
Py_ssize_t *length):
"""
Extract internal char* buffer of unicode or bytes object `py_string` with
Expand All @@ -231,7 +231,7 @@ cdef inline const char* get_c_string_buf_and_size(object py_string,

Parameters
----------
py_string : object
py_string : str
length : Py_ssize_t*

Returns
Expand All @@ -241,12 +241,9 @@ cdef inline const char* get_c_string_buf_and_size(object py_string,
cdef:
const char *buf

if PyUnicode_Check(py_string):
buf = PyUnicode_AsUTF8AndSize(py_string, length)
else:
PyBytes_AsStringAndSize(py_string, <char**>&buf, length)
buf = PyUnicode_AsUTF8AndSize(py_string, length)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hmm interesting - so I guess this helper function existed in the first place for 2/3 compat? If this is all it is reduced to why not get rid of this function altogether and just replace with PyUnicode_AsUTF8AndSize?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

so I guess this helper function existed in the first place for 2/3

thats my guess, yah.

just replace with PyUnicode_AsUTF8AndSize?

for now at least because we need to figure out what to do about the segfault thing; the imlpementation here might end up changing

return buf


cdef inline const char* get_c_string(object py_string):
cdef inline const char* get_c_string(str py_string):
return get_c_string_buf_and_size(py_string, NULL)
13 changes: 13 additions & 0 deletions pandas/tests/test_algos.py
Original file line number Diff line number Diff line change
Expand Up @@ -1402,6 +1402,19 @@ class TestGroupVarFloat32(GroupVarTestMixin):


class TestHashTable:
def test_string_hashtable_set_item_signature(self):
# GH#30419 fix typing in StringHashTable.set_item to prevent segfault
tbl = ht.StringHashTable()

tbl.set_item("key", 1)
assert tbl.get_item("key") == 1

with pytest.raises(TypeError, match="'key' has incorrect type"):
# key arg typed as string, not object
tbl.set_item(4, 6)
with pytest.raises(TypeError, match="'val' has incorrect type"):
tbl.get_item(4)

def test_lookup_nan(self, writable):
xs = np.array([2.718, 3.14, np.nan, -7, 5, 2, 3])
# GH 21688 ensure we can deal with readonly memory views
Expand Down