From 430c05f67a2189d3ce901e738621493a293eedae Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sun, 22 Dec 2019 17:30:58 -0800 Subject: [PATCH 1/4] BUG: strengthen typing in get_c_string, fix StringHashTable segfaulT --- pandas/_libs/hashtable.pxd | 2 +- pandas/_libs/hashtable_class_helper.pxi.in | 8 ++++---- pandas/_libs/tslibs/util.pxd | 11 ++++------- pandas/tests/test_algos.py | 13 +++++++++++++ 4 files changed, 22 insertions(+), 12 deletions(-) diff --git a/pandas/_libs/hashtable.pxd b/pandas/_libs/hashtable.pxd index 51ec4ba43159c..1e26fcedff5e5 100644 --- a/pandas/_libs/hashtable.pxd +++ b/pandas/_libs/hashtable.pxd @@ -37,7 +37,7 @@ cdef class StringHashTable(HashTable): cdef kh_str_t *table cpdef get_item(self, object val) - cpdef set_item(self, object key, Py_ssize_t val) + cpdef set_item(self, str key, Py_ssize_t val) cdef struct Int64VectorData: int64_t *data diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index b207fcb66948d..6dcd0f1e30570 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -599,7 +599,7 @@ cdef class StringHashTable(HashTable): sizeof(Py_ssize_t) + # vals sizeof(uint32_t)) # flags - cpdef get_item(self, object val): + cpdef get_item(self, str val): cdef: khiter_t k const char *v @@ -611,16 +611,16 @@ cdef class StringHashTable(HashTable): else: raise KeyError(val) - cpdef set_item(self, object key, Py_ssize_t val): + cpdef set_item(self, str key, Py_ssize_t val): cdef: khiter_t k int ret = 0 const char *v - v = get_c_string(val) + v = get_c_string(key) k = kh_put_str(self.table, v, &ret) - self.table.keys[k] = key + self.table.keys[k] = v if kh_exist_str(self.table, k): self.table.vals[k] = val else: diff --git a/pandas/_libs/tslibs/util.pxd b/pandas/_libs/tslibs/util.pxd index 63cbd36f9cd1d..936532a81c6d6 100644 --- a/pandas/_libs/tslibs/util.pxd +++ b/pandas/_libs/tslibs/util.pxd @@ -218,7 +218,7 @@ cdef inline bint is_nan(object val): return is_complex_object(val) and val != val -cdef inline const char* get_c_string_buf_and_size(object py_string, +cdef inline const char* get_c_string_buf_and_size(str py_string, Py_ssize_t *length): """ Extract internal char* buffer of unicode or bytes object `py_string` with @@ -231,7 +231,7 @@ cdef inline const char* get_c_string_buf_and_size(object py_string, Parameters ---------- - py_string : object + py_string : str length : Py_ssize_t* Returns @@ -241,12 +241,9 @@ cdef inline const char* get_c_string_buf_and_size(object py_string, cdef: const char *buf - if PyUnicode_Check(py_string): - buf = PyUnicode_AsUTF8AndSize(py_string, length) - else: - PyBytes_AsStringAndSize(py_string, &buf, length) + buf = PyUnicode_AsUTF8AndSize(py_string, length) return buf -cdef inline const char* get_c_string(object py_string): +cdef inline const char* get_c_string(str py_string): return get_c_string_buf_and_size(py_string, NULL) diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index e0e4beffe113a..6bee995a2a1fa 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -1402,6 +1402,19 @@ class TestGroupVarFloat32(GroupVarTestMixin): class TestHashTable: + def test_string_hashtable_set_item_signature(self): + # GH#??? fix typing in StringHashTable.set_item to prevent segfault + tbl = ht.StringHashTable() + + tbl.set_item("key", 1) + assert tbl.get_item("key") == 1 + + with pytest.raises(TypeError, match="'key' has incorrect type"): + # key arg typed as string, not object + tbl.set_item(4, 6) + with pytest.raises(TypeError, match="Expected unicode, got int"): + tbl.get_item(4) + def test_lookup_nan(self, writable): xs = np.array([2.718, 3.14, np.nan, -7, 5, 2, 3]) # GH 21688 ensure we can deal with readonly memory views From d64b47a024b83bc6da4e2dd898faf7528a5d7b2a Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sun, 22 Dec 2019 17:34:17 -0800 Subject: [PATCH 2/4] update get_item siganture --- pandas/_libs/hashtable.pxd | 2 +- pandas/tests/test_algos.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/_libs/hashtable.pxd b/pandas/_libs/hashtable.pxd index 1e26fcedff5e5..0499eabf708af 100644 --- a/pandas/_libs/hashtable.pxd +++ b/pandas/_libs/hashtable.pxd @@ -36,7 +36,7 @@ cdef class PyObjectHashTable(HashTable): cdef class StringHashTable(HashTable): cdef kh_str_t *table - cpdef get_item(self, object val) + cpdef get_item(self, str val) cpdef set_item(self, str key, Py_ssize_t val) cdef struct Int64VectorData: diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index 6bee995a2a1fa..92a69bfef8570 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -1412,7 +1412,7 @@ def test_string_hashtable_set_item_signature(self): with pytest.raises(TypeError, match="'key' has incorrect type"): # key arg typed as string, not object tbl.set_item(4, 6) - with pytest.raises(TypeError, match="Expected unicode, got int"): + with pytest.raises(TypeError, match="'val' has incorrect type"): tbl.get_item(4) def test_lookup_nan(self, writable): From c94b3511cf509d607643b146bee150ebe072f798 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sun, 22 Dec 2019 17:36:08 -0800 Subject: [PATCH 3/4] GH ref --- pandas/tests/test_algos.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index 92a69bfef8570..82f647c9385b2 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -1403,7 +1403,7 @@ class TestGroupVarFloat32(GroupVarTestMixin): class TestHashTable: def test_string_hashtable_set_item_signature(self): - # GH#??? fix typing in StringHashTable.set_item to prevent segfault + # GH#30419 fix typing in StringHashTable.set_item to prevent segfault tbl = ht.StringHashTable() tbl.set_item("key", 1) From e8b338806e5b04dd8775e49b90c6b364fb9d74d7 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Mon, 23 Dec 2019 14:39:56 -0800 Subject: [PATCH 4/4] compat for np.str_ --- pandas/_libs/hashtable_class_helper.pxi.in | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index 6dcd0f1e30570..7d57c67e70b58 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -784,7 +784,7 @@ cdef class StringHashTable(HashTable): labels[i] = na_sentinel else: # if ignore_na is False, we also stringify NaN/None/etc. - v = get_c_string(val) + v = get_c_string(val) vecs[i] = v # compute