Skip to content

Commit 5ce08c2

Browse files
jorisvandenbossche, MeeseeksDev[bot]
authored and
MeeseeksDev[bot]
committed
Backport PR pandas-dev#25592: BUG: fix usage of na_sentinel with sort=True in factorize()
1 parent c53c9d1 commit 5ce08c2

File tree

3 files changed

+29
-7
lines changed

3 files changed

+29
-7
lines changed

doc/source/whatsnew/v0.24.2.rst

+1
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@ Fixed Regressions
3333
- Fixed regression in :class:`Categorical`, where constructing it from a categorical ``Series`` and an explicit ``categories=`` that differed from that in the ``Series`` created an invalid object which could trigger segfaults. (:issue:`25318`)
3434
- Fixed regression in :func:`to_timedelta` losing precision when converting floating data to ``Timedelta`` data (:issue:`25077`).
3535
- Fixed pip installing from source into an environment without NumPy (:issue:`25193`)
36+
- Fixed regression in :func:`factorize` when passing a custom ``na_sentinel`` value with ``sort=True`` (:issue:`25409`).
3637
- Fixed regression in :meth:`DataFrame.to_csv` writing duplicate line endings with gzip compression (:issue:`25311`)
3738

3839
.. _whatsnew_0242.bug_fixes:

pandas/core/algorithms.py

+13-7
Original file line numberDiff line numberDiff line change
@@ -614,13 +614,19 @@ def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None):
614614

615615
if sort and len(uniques) > 0:
616616
from pandas.core.sorting import safe_sort
617-
try:
618-
order = uniques.argsort()
619-
order2 = order.argsort()
620-
labels = take_1d(order2, labels, fill_value=na_sentinel)
621-
uniques = uniques.take(order)
622-
except TypeError:
623-
# Mixed types, where uniques.argsort fails.
617+
if na_sentinel == -1:
618+
# GH-25409 take_1d only works for na_sentinels of -1
619+
try:
620+
order = uniques.argsort()
621+
order2 = order.argsort()
622+
labels = take_1d(order2, labels, fill_value=na_sentinel)
623+
uniques = uniques.take(order)
624+
except TypeError:
625+
# Mixed types, where uniques.argsort fails.
626+
uniques, labels = safe_sort(uniques, labels,
627+
na_sentinel=na_sentinel,
628+
assume_unique=True)
629+
else:
624630
uniques, labels = safe_sort(uniques, labels,
625631
na_sentinel=na_sentinel,
626632
assume_unique=True)

pandas/tests/test_algos.py

+15
Original file line numberDiff line numberDiff line change
@@ -321,6 +321,21 @@ def test_parametrized_factorize_na_value(self, data, na_value):
321321
tm.assert_numpy_array_equal(l, expected_labels)
322322
tm.assert_numpy_array_equal(u, expected_uniques)
323323

324+
@pytest.mark.parametrize('sort', [True, False])
325+
@pytest.mark.parametrize('na_sentinel', [-1, -10, 100])
326+
def test_factorize_na_sentinel(self, sort, na_sentinel):
327+
data = np.array(['b', 'a', None, 'b'], dtype=object)
328+
labels, uniques = algos.factorize(data, sort=sort,
329+
na_sentinel=na_sentinel)
330+
if sort:
331+
expected_labels = np.array([1, 0, na_sentinel, 1], dtype=np.intp)
332+
expected_uniques = np.array(['a', 'b'], dtype=object)
333+
else:
334+
expected_labels = np.array([0, 1, na_sentinel, 0], dtype=np.intp)
335+
expected_uniques = np.array(['b', 'a'], dtype=object)
336+
tm.assert_numpy_array_equal(labels, expected_labels)
337+
tm.assert_numpy_array_equal(uniques, expected_uniques)
338+
324339

325340
class TestUnique(object):
326341

0 commit comments

Comments
 (0)