From 73569971bc029db80499589c26dfa70fef9c0d67 Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche
Date: Thu, 7 Mar 2019 15:55:07 +0100
Subject: [PATCH 1/2] BUG: fix usage of na_sentinel with sort=True in factorize()

---
 doc/source/whatsnew/v0.24.2.rst |  1 +
 pandas/core/algorithms.py       | 20 +++++++++++++-------
 pandas/tests/test_algos.py      | 15 +++++++++++++++
 3 files changed, 29 insertions(+), 7 deletions(-)

diff --git a/doc/source/whatsnew/v0.24.2.rst b/doc/source/whatsnew/v0.24.2.rst
index 7da99590d5a0a..839754b828186 100644
--- a/doc/source/whatsnew/v0.24.2.rst
+++ b/doc/source/whatsnew/v0.24.2.rst
@@ -32,6 +32,7 @@ Fixed Regressions
 - Fixed regression in creating a period-dtype array from a read-only NumPy array of period objects. (:issue:`25403`)
 - Fixed regression in :class:`Categorical`, where constructing it from a categorical ``Series`` and an explicit ``categories=`` that differed from that in the ``Series`` created an invalid object which could trigger segfaults. (:issue:`25318`)
 - Fixed pip installing from source into an environment without NumPy (:issue:`25193`)
+- Fixed regression in :func:`factorize` when passing a custom ``na_sentinel`` value with ``sort=True`` (:issue:`25409`).
 
 .. _whatsnew_0242.enhancements:
 
diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
index 4a71951e2435e..5ed2e3efe26a1 100644
--- a/pandas/core/algorithms.py
+++ b/pandas/core/algorithms.py
@@ -619,13 +619,19 @@ def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None):
 
     if sort and len(uniques) > 0:
         from pandas.core.sorting import safe_sort
-        try:
-            order = uniques.argsort()
-            order2 = order.argsort()
-            labels = take_1d(order2, labels, fill_value=na_sentinel)
-            uniques = uniques.take(order)
-        except TypeError:
-            # Mixed types, where uniques.argsort fails.
+        if na_sentinel == -1:
+            # GH-25409 take_1d only works for na_sentinels of -1
+            try:
+                order = uniques.argsort()
+                order2 = order.argsort()
+                labels = take_1d(order2, labels, fill_value=na_sentinel)
+                uniques = uniques.take(order)
+            except TypeError:
+                # Mixed types, where uniques.argsort fails.
+                uniques, labels = safe_sort(uniques, labels,
+                                            na_sentinel=na_sentinel,
+                                            assume_unique=True)
+        else:
             uniques, labels = safe_sort(uniques, labels,
                                         na_sentinel=na_sentinel,
                                         assume_unique=True)
diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py
index 3f75c508d22f9..7c009f6a2633c 100644
--- a/pandas/tests/test_algos.py
+++ b/pandas/tests/test_algos.py
@@ -326,6 +326,21 @@ def test_parametrized_factorize_na_value(self, data, na_value):
         tm.assert_numpy_array_equal(l, expected_labels)
         tm.assert_numpy_array_equal(u, expected_uniques)
 
+    @pytest.mark.parametrize('sort', [True, False])
+    @pytest.mark.parametrize('na_sentinel', [-1, -10, 100])
+    def test_factorize_na_sentinel(self, sort, na_sentinel):
+        data = np.array(['b', 'a', None, 'b'], dtype=object)
+        labels, uniques = algos.factorize(data, sort=sort,
+                                          na_sentinel=na_sentinel)
+        if sort:
+            expected_labels = np.array([1, 0, na_sentinel, 1])
+            expected_uniques = np.array(['a', 'b'], dtype=object)
+        else:
+            expected_labels = np.array([0, 1, na_sentinel, 0])
+            expected_uniques = np.array(['b', 'a'], dtype=object)
+        tm.assert_numpy_array_equal(labels, expected_labels)
+        tm.assert_numpy_array_equal(uniques, expected_uniques)
+
 
 class TestUnique(object):
 

From e1ab3a489b1a5825527312d53dcc20f1545f04d6 Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche
Date: Mon, 11 Mar 2019 18:20:13 +0100
Subject: [PATCH 2/2] fix dtype

---
 pandas/tests/test_algos.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py
index 7c009f6a2633c..083307371b699 100644
--- a/pandas/tests/test_algos.py
+++ b/pandas/tests/test_algos.py
@@ -333,10 +333,10 @@ def test_factorize_na_sentinel(self, sort, na_sentinel):
         labels, uniques = algos.factorize(data, sort=sort,
                                           na_sentinel=na_sentinel)
         if sort:
-            expected_labels = np.array([1, 0, na_sentinel, 1])
+            expected_labels = np.array([1, 0, na_sentinel, 1], dtype=np.intp)
             expected_uniques = np.array(['a', 'b'], dtype=object)
         else:
-            expected_labels = np.array([0, 1, na_sentinel, 0])
+            expected_labels = np.array([0, 1, na_sentinel, 0], dtype=np.intp)
             expected_uniques = np.array(['b', 'a'], dtype=object)
         tm.assert_numpy_array_equal(labels, expected_labels)
         tm.assert_numpy_array_equal(uniques, expected_uniques)