diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index 48aee18c90456..8127ce1fa6eda 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -473,6 +473,7 @@ Other - Bug in :func:`infer_freq` and :meth:`DatetimeIndex.inferred_freq` with weekly frequencies and non-nanosecond resolutions (:issue:`55609`) - Bug in :meth:`DataFrame.apply` where passing ``raw=True`` ignored ``args`` passed to the applied function (:issue:`55009`) - Bug in :meth:`Dataframe.from_dict` which would always sort the rows of the created :class:`DataFrame`. (:issue:`55683`) +- Bug in Cython :meth:`StringHashTable._unique` where ephemeral ``repr`` values were used when a ``UnicodeEncodeError`` was raised (:issue:`45929`) - Bug in rendering ``inf`` values inside a a :class:`DataFrame` with the ``use_inf_as_na`` option enabled (:issue:`55483`) - Bug in rendering a :class:`Series` with a :class:`MultiIndex` when one of the index level's names is 0 not having that name displayed (:issue:`55415`) diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index 1cf5d734705af..79a0dfc6d723d 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -1128,6 +1128,7 @@ cdef class StringHashTable(HashTable): use_na_value = na_value is not None # assign pointers and pre-filter out missing (if ignore_na) + keep_rval_refs = [] vecs = malloc(n * sizeof(char *)) for i in range(n): val = values[i] @@ -1144,7 +1145,9 @@ cdef class StringHashTable(HashTable): try: v = get_c_string(val) except UnicodeEncodeError: - v = get_c_string(repr(val)) + rval = repr(val) + keep_rval_refs.append(rval) + v = get_c_string(rval) vecs[i] = v # compute diff --git a/pandas/tests/base/test_unique.py b/pandas/tests/base/test_unique.py index 4c845d8f24d01..fc9d6fedf20f0 100644 --- a/pandas/tests/base/test_unique.py +++ b/pandas/tests/base/test_unique.py @@ -97,7 +97,6 @@ def test_nunique_null(null_obj, 
index_or_series_obj): assert obj.nunique(dropna=False) == max(0, num_unique_values) -@pytest.mark.single_cpu def test_unique_bad_unicode(index_or_series): # regression test for #34550 uval = "\ud83d" # smiley emoji @@ -113,6 +112,24 @@ def test_unique_bad_unicode(index_or_series): tm.assert_numpy_array_equal(result, expected) +def test_unique_45929(index_or_series): + # regression test for #45929: strings with lone surrogates must stay distinct + data_list = [ + "1 \udcd6a NY", + "2 \udcd6b NY", + "3 \ud800c NY", + "4 \udcd6d NY", + "5 \udcc3e NY", + ] + + obj = index_or_series(data_list) + assert len(obj.unique()) == len(data_list) + assert len(obj.value_counts()) == len(data_list) + assert len(np.unique(data_list)) == len(data_list) + assert len(set(data_list)) == len(data_list) + assert obj.is_unique + + @pytest.mark.parametrize("dropna", [True, False]) def test_nunique_dropna(dropna): # GH37566