diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 66c209efb740b..42715127b6d29 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -458,6 +458,7 @@ Other - Bug in :meth:`DataFrame.sort_index` when passing ``axis="columns"`` and ``ignore_index=True`` and ``ascending=False`` not returning a :class:`RangeIndex` columns (:issue:`57293`) - Bug in :meth:`DataFrame.where` where using a non-bool type array in the function would return a ``ValueError`` instead of a ``TypeError`` (:issue:`56330`) - Bug in :meth:`Index.sort_values` when passing a key function that turns values into tuples, e.g. ``key=natsort.natsort_key``, would raise ``TypeError`` (:issue:`56081`) +- Bug in :meth:`Series.unique` returning incorrect values for unique strings that are not UTF-8 encodable (:issue:`45929`) - Bug in Dataframe Interchange Protocol implementation was returning incorrect results for data buffers' associated dtype, for string and datetime columns (:issue:`54781`) .. 
***DO NOT USE THIS SECTION*** diff --git a/pandas/_libs/hashtable.pyx b/pandas/_libs/hashtable.pyx index 97fae1d6480ce..31c5db9ed3481 100644 --- a/pandas/_libs/hashtable.pyx +++ b/pandas/_libs/hashtable.pyx @@ -1,8 +1,13 @@ cimport cython +from cpython.bytes cimport PyBytes_AsString from cpython.ref cimport ( Py_INCREF, PyObject, ) +from cpython.unicode cimport ( + PyUnicode_AsEncodedString, + PyUnicode_AsUTF8, +) from libc.stdlib cimport ( free, malloc, diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index e3a9102fec395..987a9bd380e68 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -1172,6 +1172,7 @@ cdef class StringHashTable(HashTable): const char **vecs khiter_t k bint use_na_value + list keep_bad_unicode_refs if return_inverse: labels = np.zeros(n, dtype=np.intp) @@ -1182,6 +1183,8 @@ cdef class StringHashTable(HashTable): vecs = malloc(n * sizeof(char *)) if vecs is NULL: raise MemoryError() + # https://cython.readthedocs.io/en/latest/src/userguide/language_basics.html#caveats-when-using-a-python-string-in-a-c-context + keep_bad_unicode_refs = [] for i in range(n): val = values[i] @@ -1195,9 +1198,11 @@ cdef class StringHashTable(HashTable): else: # if ignore_na is False, we also stringify NaN/None/etc. 
try: - v = get_c_string(val) + v = PyUnicode_AsUTF8(val) except UnicodeEncodeError: - v = get_c_string(repr(val)) + obj = PyUnicode_AsEncodedString(val, "utf-8", "surrogatepass") + keep_bad_unicode_refs.append(obj) + v = PyBytes_AsString(obj) vecs[i] = v # compute @@ -1223,6 +1228,8 @@ cdef class StringHashTable(HashTable): idx = self.table.vals[k] labels[i] = idx + keep_bad_unicode_refs.clear() + del keep_bad_unicode_refs free(vecs) # uniques diff --git a/pandas/tests/base/test_unique.py b/pandas/tests/base/test_unique.py index 3a8ed471f9dc0..790de180dd953 100644 --- a/pandas/tests/base/test_unique.py +++ b/pandas/tests/base/test_unique.py @@ -99,7 +99,6 @@ def test_nunique_null(null_obj, index_or_series_obj): assert obj.nunique(dropna=False) == max(0, num_unique_values) -@pytest.mark.single_cpu @pytest.mark.xfail(using_pyarrow_string_dtype(), reason="decoding fails") def test_unique_bad_unicode(index_or_series): # regression test for #34550 @@ -116,6 +115,26 @@ def test_unique_bad_unicode(index_or_series): tm.assert_numpy_array_equal(result, expected) +def test_unique_bad_unicode2(index_or_series): + # regression test for #45929 + data_list = [ + "1 \udcd6a NY", + "2 \udcd6b NY", + "3 \ud800c NY", + "4 \udcd6d NY", + "5 \udcc3e NY", + ] + + obj = index_or_series(data_list) + result = obj.unique() + if isinstance(obj, pd.Index): + expected = pd.Index(data_list, dtype=object) + tm.assert_index_equal(result, expected) + else: + expected = np.array(data_list, dtype=object) + tm.assert_numpy_array_equal(result, expected) + + def test_nunique_dropna(dropna): # GH37566 ser = pd.Series(["yes", "yes", pd.NA, np.nan, None, pd.NaT])