Skip to content

BUG: pandas.Series.unique() does not return correct unique values on non-UTF8-encodeable strings #58215

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 10 commits into from
1 change: 1 addition & 0 deletions doc/source/whatsnew/v3.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -458,6 +458,7 @@ Other
- Bug in :meth:`DataFrame.sort_index` when passing ``axis="columns"`` and ``ignore_index=True`` and ``ascending=False`` not returning a :class:`RangeIndex` columns (:issue:`57293`)
- Bug in :meth:`DataFrame.where` where using a non-bool type array in the function would return a ``ValueError`` instead of a ``TypeError`` (:issue:`56330`)
- Bug in :meth:`Index.sort_values` when passing a key function that turns values into tuples, e.g. ``key=natsort.natsort_key``, would raise ``TypeError`` (:issue:`56081`)
- Bug in :meth:`Series.unique` returning incorrect value for unique, non-UTF8 encodeable strings (:issue:`45929`)
- Bug in Dataframe Interchange Protocol implementation was returning incorrect results for data buffers' associated dtype, for string and datetime columns (:issue:`54781`)

.. ***DO NOT USE THIS SECTION***
Expand Down
5 changes: 5 additions & 0 deletions pandas/_libs/hashtable.pyx
Original file line number Diff line number Diff line change
@@ -1,8 +1,13 @@
cimport cython
from cpython.bytes cimport PyBytes_AsString
from cpython.ref cimport (
Py_INCREF,
PyObject,
)
from cpython.unicode cimport (
PyUnicode_AsEncodedString,
PyUnicode_AsUTF8,
)
from libc.stdlib cimport (
free,
malloc,
Expand Down
11 changes: 9 additions & 2 deletions pandas/_libs/hashtable_class_helper.pxi.in
Original file line number Diff line number Diff line change
Expand Up @@ -1172,6 +1172,7 @@ cdef class StringHashTable(HashTable):
const char **vecs
khiter_t k
bint use_na_value
list keep_bad_unicode_refs

if return_inverse:
labels = np.zeros(n, dtype=np.intp)
Expand All @@ -1182,6 +1183,8 @@ cdef class StringHashTable(HashTable):
vecs = <const char **>malloc(n * sizeof(char *))
if vecs is NULL:
raise MemoryError()
# https://cython.readthedocs.io/en/latest/src/userguide/language_basics.html#caveats-when-using-a-python-string-in-a-c-context
keep_bad_unicode_refs = []
for i in range(n):
val = values[i]

Expand All @@ -1195,9 +1198,11 @@ cdef class StringHashTable(HashTable):
else:
# if ignore_na is False, we also stringify NaN/None/etc.
try:
v = get_c_string(<str>val)
v = PyUnicode_AsUTF8(<str>val)
except UnicodeEncodeError:
v = get_c_string(<str>repr(val))
obj = PyUnicode_AsEncodedString(<str>val, "utf-8", "surrogatepass")
keep_bad_unicode_refs.append(obj)
v = PyBytes_AsString(obj)
vecs[i] = v

# compute
Expand All @@ -1223,6 +1228,8 @@ cdef class StringHashTable(HashTable):
idx = self.table.vals[k]
labels[i] = idx

keep_bad_unicode_refs.clear()
del keep_bad_unicode_refs
free(vecs)

# uniques
Expand Down
21 changes: 20 additions & 1 deletion pandas/tests/base/test_unique.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,7 +99,6 @@ def test_nunique_null(null_obj, index_or_series_obj):
assert obj.nunique(dropna=False) == max(0, num_unique_values)


@pytest.mark.single_cpu
@pytest.mark.xfail(using_pyarrow_string_dtype(), reason="decoding fails")
def test_unique_bad_unicode(index_or_series):
# regression test for #34550
Expand All @@ -116,6 +115,26 @@ def test_unique_bad_unicode(index_or_series):
tm.assert_numpy_array_equal(result, expected)


def test_unique_bad_unicode2(index_or_series):
    # regression test for #45929
    # All five entries contain lone surrogates (not UTF-8 encodeable) and are
    # pairwise distinct, so unique() must return every one of them unchanged.
    values = [
        "1 \udcd6a NY",
        "2 \udcd6b NY",
        "3 \ud800c NY",
        "4 \udcd6d NY",
        "5 \udcc3e NY",
    ]

    obj = index_or_series(values)
    result = obj.unique()
    if not isinstance(obj, pd.Index):
        expected = np.array(values, dtype=object)
        tm.assert_numpy_array_equal(result, expected)
    else:
        expected = pd.Index(values, dtype=object)
        tm.assert_index_equal(result, expected)


def test_nunique_dropna(dropna):
# GH37566
ser = pd.Series(["yes", "yes", pd.NA, np.nan, None, pd.NaT])
Expand Down