Skip to content

Commit a8fad16

Browse files
BUG: fix usage of na_sentinel with sort=True in factorize() (#25592)
1 parent 5c341dc commit a8fad16

File tree

3 files changed

+29
-7
lines changed

3 files changed

+29
-7
lines changed

doc/source/whatsnew/v0.24.2.rst

+1
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@ Fixed Regressions
3333
- Fixed regression in :class:`Categorical`, where constructing it from a categorical ``Series`` and an explicit ``categories=`` that differed from those in the ``Series`` created an invalid object which could trigger segfaults. (:issue:`25318`)
3434
- Fixed regression in :func:`to_timedelta` losing precision when converting floating data to ``Timedelta`` data (:issue:`25077`).
3535
- Fixed pip installing from source into an environment without NumPy (:issue:`25193`)
36+
- Fixed regression in :func:`factorize` when passing a custom ``na_sentinel`` value with ``sort=True`` (:issue:`25409`).
3637
- Fixed regression in :meth:`DataFrame.to_csv` writing duplicate line endings with gzip compression (:issue:`25311`)
3738

3839
.. _whatsnew_0242.bug_fixes:

pandas/core/algorithms.py

+13-7
Original file line numberDiff line numberDiff line change
@@ -619,13 +619,19 @@ def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None):
619619

620620
if sort and len(uniques) > 0:
621621
from pandas.core.sorting import safe_sort
622-
try:
623-
order = uniques.argsort()
624-
order2 = order.argsort()
625-
labels = take_1d(order2, labels, fill_value=na_sentinel)
626-
uniques = uniques.take(order)
627-
except TypeError:
628-
# Mixed types, where uniques.argsort fails.
622+
if na_sentinel == -1:
623+
# GH-25409 take_1d only works for na_sentinels of -1
624+
try:
625+
order = uniques.argsort()
626+
order2 = order.argsort()
627+
labels = take_1d(order2, labels, fill_value=na_sentinel)
628+
uniques = uniques.take(order)
629+
except TypeError:
630+
# Mixed types, where uniques.argsort fails.
631+
uniques, labels = safe_sort(uniques, labels,
632+
na_sentinel=na_sentinel,
633+
assume_unique=True)
634+
else:
629635
uniques, labels = safe_sort(uniques, labels,
630636
na_sentinel=na_sentinel,
631637
assume_unique=True)

pandas/tests/test_algos.py

+15
Original file line numberDiff line numberDiff line change
@@ -326,6 +326,21 @@ def test_parametrized_factorize_na_value(self, data, na_value):
326326
tm.assert_numpy_array_equal(l, expected_labels)
327327
tm.assert_numpy_array_equal(u, expected_uniques)
328328

329+
@pytest.mark.parametrize('sort', [True, False])
330+
@pytest.mark.parametrize('na_sentinel', [-1, -10, 100])
331+
def test_factorize_na_sentinel(self, sort, na_sentinel):
332+
data = np.array(['b', 'a', None, 'b'], dtype=object)
333+
labels, uniques = algos.factorize(data, sort=sort,
334+
na_sentinel=na_sentinel)
335+
if sort:
336+
expected_labels = np.array([1, 0, na_sentinel, 1], dtype=np.intp)
337+
expected_uniques = np.array(['a', 'b'], dtype=object)
338+
else:
339+
expected_labels = np.array([0, 1, na_sentinel, 0], dtype=np.intp)
340+
expected_uniques = np.array(['b', 'a'], dtype=object)
341+
tm.assert_numpy_array_equal(labels, expected_labels)
342+
tm.assert_numpy_array_equal(uniques, expected_uniques)
343+
329344

330345
class TestUnique(object):
331346

0 commit comments

Comments
 (0)