Skip to content

BUG: pandas.Series.unique() does not return correct unique values on non-UTF8-encodeable strings #58215

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 10 commits into from
1 change: 1 addition & 0 deletions doc/source/whatsnew/v3.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -458,6 +458,7 @@ Other
- Bug in :meth:`DataFrame.sort_index` when passing ``axis="columns"`` and ``ignore_index=True`` and ``ascending=False`` not returning a :class:`RangeIndex` columns (:issue:`57293`)
- Bug in :meth:`DataFrame.where` where using a non-bool type array in the function would return a ``ValueError`` instead of a ``TypeError`` (:issue:`56330`)
- Bug in :meth:`Index.sort_values` when passing a key function that turns values into tuples, e.g. ``key=natsort.natsort_key``, would raise ``TypeError`` (:issue:`56081`)
- Bug in :meth:`Series.unique` returning incorrect value for unique, non-UTF8 encodeable strings (:issue:`45929`)
- Bug in Dataframe Interchange Protocol implementation was returning incorrect results for data buffers' associated dtype, for string and datetime columns (:issue:`54781`)

.. ***DO NOT USE THIS SECTION***
Expand Down
5 changes: 5 additions & 0 deletions pandas/_libs/hashtable.pyx
Original file line number Diff line number Diff line change
@@ -1,8 +1,13 @@
cimport cython
from cpython.bytes cimport PyBytes_AsString
from cpython.ref cimport (
Py_INCREF,
PyObject,
)
from cpython.unicode cimport (
PyUnicode_AsEncodedString,
PyUnicode_AsUTF8,
)
from libc.stdlib cimport (
free,
malloc,
Expand Down
11 changes: 9 additions & 2 deletions pandas/_libs/hashtable_class_helper.pxi.in
Original file line number Diff line number Diff line change
Expand Up @@ -1172,6 +1172,7 @@ cdef class StringHashTable(HashTable):
const char **vecs
khiter_t k
bint use_na_value
list keep_bad_unicode_refs

if return_inverse:
labels = np.zeros(n, dtype=np.intp)
Expand All @@ -1182,6 +1183,8 @@ cdef class StringHashTable(HashTable):
vecs = <const char **>malloc(n * sizeof(char *))
if vecs is NULL:
raise MemoryError()
# https://cython.readthedocs.io/en/latest/src/userguide/language_basics.html#caveats-when-using-a-python-string-in-a-c-context
keep_bad_unicode_refs = []
for i in range(n):
val = values[i]

Expand All @@ -1195,9 +1198,11 @@ cdef class StringHashTable(HashTable):
else:
# if ignore_na is False, we also stringify NaN/None/etc.
try:
v = get_c_string(<str>val)
v = PyUnicode_AsUTF8(<str>val)
except UnicodeEncodeError:
v = get_c_string(<str>repr(val))
obj = PyUnicode_AsEncodedString(<str>val, "utf-8", "surrogatepass")
keep_bad_unicode_refs.append(obj)
v = PyBytes_AsString(obj)
vecs[i] = v

# compute
Expand All @@ -1223,6 +1228,8 @@ cdef class StringHashTable(HashTable):
idx = self.table.vals[k]
labels[i] = idx

keep_bad_unicode_refs.clear()
del keep_bad_unicode_refs
free(vecs)

# uniques
Expand Down
21 changes: 20 additions & 1 deletion pandas/tests/base/test_unique.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,7 +99,6 @@ def test_nunique_null(null_obj, index_or_series_obj):
assert obj.nunique(dropna=False) == max(0, num_unique_values)


@pytest.mark.single_cpu
@pytest.mark.xfail(using_pyarrow_string_dtype(), reason="decoding fails")
def test_unique_bad_unicode(index_or_series):
# regression test for #34550
Expand All @@ -116,6 +115,26 @@ def test_unique_bad_unicode(index_or_series):
tm.assert_numpy_array_equal(result, expected)


def test_unique_bad_unicode2(index_or_series):
    # regression test for #45929
    # All five entries contain lone surrogates (not UTF-8 encodeable) and are
    # pairwise distinct, so unique() must return every one of them unchanged.
    values = [
        "1 \udcd6a NY",
        "2 \udcd6b NY",
        "3 \ud800c NY",
        "4 \udcd6d NY",
        "5 \udcc3e NY",
    ]

    obj = index_or_series(values)
    result = obj.unique()
    if not isinstance(obj, pd.Index):
        expected = np.array(values, dtype=object)
        tm.assert_numpy_array_equal(result, expected)
    else:
        expected = pd.Index(values, dtype=object)
        tm.assert_index_equal(result, expected)


def test_nunique_dropna(dropna):
# GH37566
ser = pd.Series(["yes", "yes", pd.NA, np.nan, None, pd.NaT])
Expand Down