From c300b46e33be8b839937bd2cae96a2d3846d4fa0 Mon Sep 17 00:00:00 2001 From: Michael Tiemann <72577720+MichaelTiemannOSC@users.noreply.github.com> Date: Sun, 15 Oct 2023 08:15:19 -0400 Subject: [PATCH 1/8] Resolve Heisenbug in StringHashTable._unique When processing an invalid Unicode string, the exception handler for UnicodeEncodeError called `get_c_string` with an ephemeral repr value that could be garbage-collected the next time an exception was raised. Issue #45929 demonstrates the problem. This commit fixes the problem by retaining a Python reference to the repr value that underlies the C string until after all `values` are processed. Wisdom from StackOverflow suggests that there's very small performance difference between pre-allocating the array vs. append if indeed we do need to fill it all the way, but because we only need references on exceptions, we expect that in the usual case we will append very few elements, making it faster than pre-allocation. Signed-off-by: Michael Tiemann <72577720+MichaelTiemannOSC@users.noreply.github.com> --- doc/source/whatsnew/v2.2.0.rst | 1 + pandas/_libs/hashtable_class_helper.pxi.in | 5 ++++- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index ef59c86a21598..bf396b8124136 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -403,6 +403,7 @@ Other ^^^^^ - Bug in :func:`cut` incorrectly allowing cutting of timezone-aware datetimes with timezone-naive bins (:issue:`54964`) - Bug in :meth:`DataFrame.apply` where passing ``raw=True`` ignored ``args`` passed to the applied function (:issue:`55009`) +- Bug in Cython :meth:`StringHashTable._unique` used ephemeral repr values when UnicodeEncodeError was raised (:issue:`45929`) - Bug in rendering ``inf`` values inside a a :class:`DataFrame` with the ``use_inf_as_na`` option enabled (:issue:`55483`) - Bug in rendering a :class:`Series` with a :class:`MultiIndex` when one of the index level's names is 0 not having that name displayed (:issue:`55415`) - diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index 1cf5d734705af..79a0dfc6d723d 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -1128,6 +1128,7 @@ cdef class StringHashTable(HashTable): use_na_value = na_value is not None # assign pointers and pre-filter out missing (if ignore_na) + keep_rval_refs = [] vecs = malloc(n * sizeof(char *)) for i in range(n): val = values[i] @@ -1144,7 +1145,9 @@ cdef class StringHashTable(HashTable): try: v = get_c_string(val) except UnicodeEncodeError: - v = get_c_string(repr(val)) + rval = repr(val) + keep_rval_refs.append(rval) + v = get_c_string(rval) vecs[i] = v # compute From b4157f069c41b20847c1a4c2595dc304bb3b5ad1 Mon Sep 17 00:00:00 2001 From: Michael Tiemann <72577720+MichaelTiemannOSC@users.noreply.github.com> Date: Sun, 15 Oct 2023 08:32:16 -0400 Subject: [PATCH 2/8] Added test for #45929 and removed superfluous single_cpu mark The `single_cpu` attribute for `test_unique_bad_unicode` was likely an attempt to cover over the underlying bug fixed with this commit. We can now run this test in the usual fashion. Added a test case for the problem reported in 45929. Signed-off-by: Michael Tiemann <72577720+MichaelTiemannOSC@users.noreply.github.com> --- pandas/tests/base/test_unique.py | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/pandas/tests/base/test_unique.py b/pandas/tests/base/test_unique.py index 4c845d8f24d01..fc9d6fedf20f0 100644 --- a/pandas/tests/base/test_unique.py +++ b/pandas/tests/base/test_unique.py @@ -97,7 +97,6 @@ def test_nunique_null(null_obj, index_or_series_obj): assert obj.nunique(dropna=False) == max(0, num_unique_values) -@pytest.mark.single_cpu def test_unique_bad_unicode(index_or_series): # regression test for #34550 uval = "\ud83d" # smiley emoji @@ -113,6 +112,24 @@ def test_unique_bad_unicode(index_or_series): tm.assert_numpy_array_equal(result, expected) +def test_unique_45929(index_or_series): + # regression test for #45929 + data_list = [ + "1 \udcd6a NY", + "2 \udcd6b NY", + "3 \ud800c NY", + "4 \udcd6d NY", + "5 \udcc3e NY", + ] + + obj = index_or_series(data_list) + assert len(obj.unique()) == len(data_list) + assert len(obj.value_counts()) == len(data_list) + assert len(np.unique(data_list)) == len(data_list) + assert len(set(data_list)) == len(data_list) + assert obj.is_unique + + @pytest.mark.parametrize("dropna", [True, False]) def test_nunique_dropna(dropna): # GH37566 From f33cdc073fe9fd744f95f74a6d815b264ce4384d Mon Sep 17 00:00:00 2001 From: Michael Tiemann <72577720+MichaelTiemannOSC@users.noreply.github.com> Date: Thu, 16 Nov 2023 15:11:16 +1300 Subject: [PATCH 3/8] Update v2.2.0.rst Correctly alphabetize items in `Other` list. Signed-off-by: Michael Tiemann <72577720+MichaelTiemannOSC@users.noreply.github.com> --- doc/source/whatsnew/v2.2.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index 10d58fc814835..8127ce1fa6eda 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -472,8 +472,8 @@ Other - Bug in :func:`cut` incorrectly allowing cutting of timezone-aware datetimes with timezone-naive bins (:issue:`54964`) - Bug in :func:`infer_freq` and :meth:`DatetimeIndex.inferred_freq` with weekly frequencies and non-nanosecond resolutions (:issue:`55609`) - Bug in :meth:`DataFrame.apply` where passing ``raw=True`` ignored ``args`` passed to the applied function (:issue:`55009`) -- Bug in Cython :meth:`StringHashTable._unique` used ephemeral repr values when UnicodeEncodeError was raised (:issue:`45929`) - Bug in :meth:`Dataframe.from_dict` which would always sort the rows of the created :class:`DataFrame`. (:issue:`55683`) +- Bug in Cython :meth:`StringHashTable._unique` used ephemeral repr values when UnicodeEncodeError was raised (:issue:`45929`) - Bug in rendering ``inf`` values inside a a :class:`DataFrame` with the ``use_inf_as_na`` option enabled (:issue:`55483`) - Bug in rendering a :class:`Series` with a :class:`MultiIndex` when one of the index level's names is 0 not having that name displayed (:issue:`55415`) From 9abe522674710767087f9d7c6aff876ee983a603 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 10 Apr 2024 15:24:22 -0700 Subject: [PATCH 4/8] Add comment and clear our references --- pandas/_libs/hashtable_class_helper.pxi.in | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index 491f08ae4f29a..c899d3005fa2e 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -1179,7 +1179,8 @@ cdef class StringHashTable(HashTable): use_na_value = na_value is not None # assign pointers and pre-filter out missing (if ignore_na) - keep_rval_refs = [] + # https://cython.readthedocs.io/en/latest/src/userguide/language_basics.html#caveats-when-using-a-python-string-in-a-c-context + keep_bad_unicode_refs = [] vecs = malloc(n * sizeof(char *)) if vecs is NULL: raise MemoryError() @@ -1199,7 +1200,7 @@ cdef class StringHashTable(HashTable): v = get_c_string(val) except UnicodeEncodeError: rval = repr(val) - keep_rval_refs.append(rval) + keep_bad_unicode_refs.append(rval) v = get_c_string(rval) vecs[i] = v @@ -1226,6 +1227,8 @@ cdef class StringHashTable(HashTable): idx = self.table.vals[k] labels[i] = idx + keep_bad_unicode_refs.clear() + del keep_bad_unicode_refs free(vecs) # uniques From 043fc353061a2281a6807acc7c416069b976383b Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 10 Apr 2024 15:27:45 -0700 Subject: [PATCH 5/8] Add whatsnew --- doc/source/whatsnew/v3.0.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 66c209efb740b..42715127b6d29 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -458,6 +458,7 @@ Other - Bug in :meth:`DataFrame.sort_index` when passing ``axis="columns"`` and ``ignore_index=True`` and ``ascending=False`` not returning a :class:`RangeIndex` columns (:issue:`57293`) - Bug in :meth:`DataFrame.where` where using a non-bool type array in the function would return a ``ValueError`` instead of a ``TypeError`` (:issue:`56330`) - Bug in :meth:`Index.sort_values` when passing a key function that turns values into tuples, e.g. ``key=natsort.natsort_key``, would raise ``TypeError`` (:issue:`56081`) +- Bug in :meth:`Series.unique` returning incorrect value for unique, non-UTF8 encodeable strings (:issue:`45929`) - Bug in Dataframe Interchange Protocol implementation was returning incorrect results for data buffers' associated dtype, for string and datetime columns (:issue:`54781`) .. ***DO NOT USE THIS SECTION*** From 0de9f5f782316be23dd41dff4e217c3e5e0c2c25 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 10 Apr 2024 15:30:37 -0700 Subject: [PATCH 6/8] More exact test --- pandas/tests/base/test_unique.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/pandas/tests/base/test_unique.py b/pandas/tests/base/test_unique.py index 86156a0b0eed9..ab1c8ab168dd8 100644 --- a/pandas/tests/base/test_unique.py +++ b/pandas/tests/base/test_unique.py @@ -126,7 +126,13 @@ def test_unique_bad_unicode2(index_or_series): ] obj = index_or_series(data_list) - assert len(obj.unique()) == len(obj) + result = obj.unique() + if isinstance(obj, pd.Index): + expected = pd.Index(data_list, dtype=object) + tm.assert_index_equal(result, expected) + else: + expected = np.array(data_list, dtype=object) + tm.assert_numpy_array_equal(result, expected) @pytest.mark.parametrize("dropna", [True, False]) From 5d87ebc7ee08b569b22b53d9a2aec1cc04808686 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 11 Apr 2024 12:01:46 -0700 Subject: [PATCH 7/8] Use CAPIs --- pandas/_libs/hashtable.pyx | 5 +++++ pandas/_libs/hashtable_class_helper.pxi.in | 13 +++++++------ 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/pandas/_libs/hashtable.pyx b/pandas/_libs/hashtable.pyx index 97fae1d6480ce..31c5db9ed3481 100644 --- a/pandas/_libs/hashtable.pyx +++ b/pandas/_libs/hashtable.pyx @@ -1,8 +1,13 @@ cimport cython +from cpython.bytes cimport PyBytes_AsString from cpython.ref cimport ( Py_INCREF, PyObject, ) +from cpython.unicode cimport ( + PyUnicode_AsEncodedString, + PyUnicode_AsUTF8, +) from libc.stdlib cimport ( free, malloc, diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index c899d3005fa2e..987a9bd380e68 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -1172,6 +1172,7 @@ cdef class StringHashTable(HashTable): const char **vecs khiter_t k bint use_na_value + list keep_bad_unicode_refs if return_inverse: labels = np.zeros(n, dtype=np.intp) @@ -1179,11 +1180,11 @@ cdef class StringHashTable(HashTable): use_na_value = na_value is not None # assign pointers and pre-filter out missing (if ignore_na) - # https://cython.readthedocs.io/en/latest/src/userguide/language_basics.html#caveats-when-using-a-python-string-in-a-c-context - keep_bad_unicode_refs = [] vecs = malloc(n * sizeof(char *)) if vecs is NULL: raise MemoryError() + # https://cython.readthedocs.io/en/latest/src/userguide/language_basics.html#caveats-when-using-a-python-string-in-a-c-context + keep_bad_unicode_refs = [] for i in range(n): val = values[i] @@ -1197,11 +1198,11 @@ cdef class StringHashTable(HashTable): else: # if ignore_na is False, we also stringify NaN/None/etc. try: - v = get_c_string(val) + v = PyUnicode_AsUTF8(val) except UnicodeEncodeError: - rval = repr(val) - keep_bad_unicode_refs.append(rval) - v = get_c_string(rval) + obj = PyUnicode_AsEncodedString(val, "utf-8", "surrogatepass") + keep_bad_unicode_refs.append(obj) + v = PyBytes_AsString(obj) vecs[i] = v # compute From b78c6cb3c93fd074e141b1de02d26213e00cca40 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 11 Apr 2024 12:26:06 -0700 Subject: [PATCH 8/8] Remove param --- pandas/tests/base/test_unique.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/tests/base/test_unique.py b/pandas/tests/base/test_unique.py index ab1c8ab168dd8..790de180dd953 100644 --- a/pandas/tests/base/test_unique.py +++ b/pandas/tests/base/test_unique.py @@ -135,7 +135,6 @@ def test_unique_bad_unicode2(index_or_series): tm.assert_numpy_array_equal(result, expected) -@pytest.mark.parametrize("dropna", [True, False]) def test_nunique_dropna(dropna): # GH37566 ser = pd.Series(["yes", "yes", pd.NA, np.nan, None, pd.NaT])