From 0b3d22d5b28ba110a79b364450a2a9c1d8846dc0 Mon Sep 17 00:00:00 2001 From: makbigc Date: Sun, 30 Jun 2019 00:14:46 +0800 Subject: [PATCH 01/10] Use nargsort inside EA.argsort to sort --- doc/source/whatsnew/v0.25.0.rst | 1 + pandas/core/arrays/base.py | 11 ++++++----- pandas/core/arrays/categorical.py | 10 +++++++--- pandas/tests/extension/base/methods.py | 5 +++++ pandas/tests/extension/test_categorical.py | 5 +++++ 5 files changed, 24 insertions(+), 8 deletions(-) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 83a90443b24cb..b09b42362ab64 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -562,6 +562,7 @@ Other API changes - Removed support of gtk package for clipboards (:issue:`26563`) - Using an unsupported version of Beautiful Soup 4 will now raise an ``ImportError`` instead of a ``ValueError`` (:issue:`27063`) - :meth:`Series.to_excel` and :meth:`DataFrame.to_excel` will now raise a ``ValueError`` when saving timezone aware data. (:issue:`27008`, :issue:`7056`) +- :meth:`ExtensionArray.argsort` places NA values at the end of the sorted array, except :class:`Categorical` (:issue:`21801`) .. _whatsnew_0250.deprecations: diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 6340cc732d6c1..1b8445d3710a5 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -23,6 +23,7 @@ from pandas._typing import ArrayLike from pandas.core import ops +from pandas.core.sorting import nargsort _not_implemented_message = "{} does not implement {}." @@ -398,7 +399,8 @@ def argsort(self, ascending=True, kind='quicksort', *args, **kwargs): Returns ------- index_array : ndarray - Array of indices that sort ``self``. + Array of indices that sort ``self``. If NaN values are contained, + NaN values are placed at the end. See Also -------- @@ -409,10 +411,9 @@ def argsort(self, ascending=True, kind='quicksort', *args, **kwargs): # 1. _values_for_argsort : construct the values passed to np.argsort # 2. argsort : total control over sorting. ascending = nv.validate_argsort_with_ascending(ascending, args, kwargs) - values = self._values_for_argsort() - result = np.argsort(values, kind=kind, **kwargs) - if not ascending: - result = result[::-1] + + result = nargsort(self, kind=kind, ascending=ascending, + na_position='last') return result def fillna(self, value=None, method=None, limit=None): diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 3ef2f41f25338..432f6d5f8af8e 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -1509,7 +1509,7 @@ def check_for_ordered(self, op): def _values_for_argsort(self): return self._codes.copy() - def argsort(self, *args, **kwargs): + def argsort(self, ascending=True, kind='quicksort', *args, **kwargs): # TODO(PY2): use correct signature # We have to do *args, **kwargs to avoid a a py2-only signature # issue since np.argsort differs from argsort. @@ -1553,8 +1553,12 @@ def argsort(self, *args, **kwargs): >>> cat.argsort() array([3, 0, 1, 2]) """ - # Keep the implementation here just for the docstring. - return super().argsort(*args, **kwargs) + ascending = nv.validate_argsort_with_ascending(ascending, args, kwargs) + values = self._values_for_argsort() + result = np.argsort(values, kind=kind, **kwargs) + if not ascending: + result = result[::-1] + return result def sort_values(self, inplace=False, ascending=True, na_position='last'): """ diff --git a/pandas/tests/extension/base/methods.py b/pandas/tests/extension/base/methods.py index d9e61e6a227e6..e5edd124b9058 100644 --- a/pandas/tests/extension/base/methods.py +++ b/pandas/tests/extension/base/methods.py @@ -47,6 +47,11 @@ def test_argsort(self, data_for_sorting): expected = pd.Series(np.array([2, 0, 1], dtype=np.int64)) self.assert_series_equal(result, expected) + def test_argsort_nan_last(self, data_missing_for_sorting): + # GH 21801 + result = data_missing_for_sorting.argsort() + assert result[-1] == 1 + def test_argsort_missing(self, data_missing_for_sorting): result = pd.Series(data_missing_for_sorting).argsort() expected = pd.Series(np.array([1, -1, 0], dtype=np.int64)) diff --git a/pandas/tests/extension/test_categorical.py b/pandas/tests/extension/test_categorical.py index 4cf9f78e1531d..a169a08fa0d51 100644 --- a/pandas/tests/extension/test_categorical.py +++ b/pandas/tests/extension/test_categorical.py @@ -193,6 +193,11 @@ def test_searchsorted(self, data_for_sorting): if not data_for_sorting.ordered: raise pytest.skip(reason="searchsorted requires ordered data.") + def test_argsort_nan_last(self, data_missing_for_sorting): + # GH 21801 + # TODO: Categorical.argsort places NA values at the end + pass + class TestCasting(base.BaseCastingTests): pass From 79df1d85483843ba453e3383b9f5a27931cb8cfc Mon Sep 17 00:00:00 2001 From: makbigc Date: Sun, 30 Jun 2019 13:15:20 +0800 Subject: [PATCH 02/10] Add benchmark --- asv_bench/benchmarks/algorithms.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/asv_bench/benchmarks/algorithms.py b/asv_bench/benchmarks/algorithms.py index 45ef47fde0a56..659ba206ca83d 100644 --- a/asv_bench/benchmarks/algorithms.py +++ b/asv_bench/benchmarks/algorithms.py @@ -141,4 +141,16 @@ def time_quantile(self, quantile, interpolation, dtype): self.idx.quantile(quantile, interpolation=interpolation) +class EASorting: + params = [10**3, 10**5] + + def setup(self, N): + data = np.arange(N, dtype=float) + data[40] = np.nan + self.array = pd.array(data, dtype='Int64') + + def time_argsort(self, N): + self.array.argsort() + + from .pandas_vb_common import setup # noqa: F401 isort:skip From a5d46ce48f21fa56db973faf04dbafa0654898ee Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 3 Jul 2019 08:33:00 -0500 Subject: [PATCH 03/10] Updates * rework test * xfail categorical * rename asv * update whatsnew --- asv_bench/benchmarks/algorithms.py | 2 +- pandas/tests/extension/base/methods.py | 6 +++--- pandas/tests/extension/test_categorical.py | 7 +++---- 3 files changed, 7 insertions(+), 8 deletions(-) diff --git a/asv_bench/benchmarks/algorithms.py b/asv_bench/benchmarks/algorithms.py index 659ba206ca83d..b69efb4689486 100644 --- a/asv_bench/benchmarks/algorithms.py +++ b/asv_bench/benchmarks/algorithms.py @@ -141,7 +141,7 @@ def time_quantile(self, quantile, interpolation, dtype): self.idx.quantile(quantile, interpolation=interpolation) -class EASorting: +class SortIntegerArray: params = [10**3, 10**5] def setup(self, N): diff --git a/pandas/tests/extension/base/methods.py b/pandas/tests/extension/base/methods.py index e5edd124b9058..80f824775ed83 100644 --- a/pandas/tests/extension/base/methods.py +++ b/pandas/tests/extension/base/methods.py @@ -47,10 +47,10 @@ def test_argsort(self, data_for_sorting): expected = pd.Series(np.array([2, 0, 1], dtype=np.int64)) self.assert_series_equal(result, expected) - def test_argsort_nan_last(self, data_missing_for_sorting): - # GH 21801 + def test_argsort_missing_array(self, data_missing_for_sorting): result = data_missing_for_sorting.argsort() - assert result[-1] == 1 + expected = np.array([2, 0, 1]) + tm.assert_numpy_array_equal(result, expected) def test_argsort_missing(self, data_missing_for_sorting): result = pd.Series(data_missing_for_sorting).argsort() diff --git a/pandas/tests/extension/test_categorical.py b/pandas/tests/extension/test_categorical.py index a169a08fa0d51..d05819b4099a9 100644 --- a/pandas/tests/extension/test_categorical.py +++ b/pandas/tests/extension/test_categorical.py @@ -193,10 +193,9 @@ def test_searchsorted(self, data_for_sorting): if not data_for_sorting.ordered: raise pytest.skip(reason="searchsorted requires ordered data.") - def test_argsort_nan_last(self, data_missing_for_sorting): - # GH 21801 - # TODO: Categorical.argsort places NA values at the end - pass + @pytest.mark.xfail(reason="TODO") + def test_argsort_missing_array(self, data_missing_for_sorting): + super().test_argsort_missing_array(data_missing_for_sorting) class TestCasting(base.BaseCastingTests): From b1b96a768035de963e54a1ef209db433e8e8e53a Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 3 Jul 2019 08:44:32 -0500 Subject: [PATCH 04/10] API: break Categorical.argsort --- doc/source/whatsnew/v0.25.0.rst | 32 +++++++++++++++++++++- pandas/core/arrays/categorical.py | 20 ++++++++------ pandas/core/sorting.py | 14 ---------- pandas/tests/extension/test_categorical.py | 4 --- 4 files changed, 42 insertions(+), 28 deletions(-) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index a225c94ea0c75..4ef6921cc53ce 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -661,7 +661,7 @@ when both are :class:`Series` (:issue:`23293`). *Previous behavior* -.. code-block:: python +.. code-block:: ipython In [5]: np.power(s1, s2) Out[5]: @@ -684,6 +684,36 @@ applying the ufunc. np.power(s1, s2.array) +Categorical.argsort now places missing values at the end +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +:meth:`Categorical.argsort` now places missing values at the end of the array, making it +consistent with NumPy and the rest of pandas. + +.. ipython:: python + + cat = pd.Categorical(['b', None, 'a'], categories=['a', 'b'], ordered=True) + +*Previous behavior* + +.. code-block:: ipython + + In [2]: cat = pd.Categorical(['b', None, 'a'], categories=['a', 'b'], ordered=True) + + In [3]: cat.argsort() + Out[3]: array([1, 2, 0]) + + In [4]: cat[cat.argsort()] + Out[4]: + [NaN, a, b] + categories (2, object): [a < b] + +*New behavior* + +.. ipython:: python + + cat.argsort() + cat[cat.argsort()] .. _whatsnew_0250.api_breaking.deps: diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 35603e60ff8bc..e901c11cf3054 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -1532,12 +1532,13 @@ def _values_for_argsort(self): return self._codes.copy() def argsort(self, ascending=True, kind='quicksort', *args, **kwargs): - # TODO(PY2): use correct signature - # We have to do *args, **kwargs to avoid a a py2-only signature - # issue since np.argsort differs from argsort. """ Return the indices that would sort the Categorical. + .. versionchanged:: 0.25.0 + + Changed to sort missing values at the end. + Parameters ---------- ascending : bool, default True @@ -1574,13 +1575,14 @@ def argsort(self, ascending=True, kind='quicksort', *args, **kwargs): ... ordered=True) >>> cat.argsort() array([3, 0, 1, 2]) + + Missing values are placed at the end + + >>> cat = pd.Categorical([2, None, 1]) + >>> cat.argsort() + array([2, 0, 1]) """ - ascending = nv.validate_argsort_with_ascending(ascending, args, kwargs) - values = self._values_for_argsort() - result = np.argsort(values, kind=kind, **kwargs) - if not ascending: - result = result[::-1] - return result + return super().argsort(ascending=ascending, kind=kind, *args, **kwargs) def sort_values(self, inplace=False, ascending=True, na_position='last'): """ diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py index 750a4c903176f..b79390581612b 100644 --- a/pandas/core/sorting.py +++ b/pandas/core/sorting.py @@ -240,20 +240,6 @@ def nargsort(items, kind='quicksort', ascending=True, na_position='last'): items = extract_array(items) mask = np.asarray(isna(items)) - # specially handle Categorical - if is_categorical_dtype(items): - if na_position not in {'first', 'last'}: - raise ValueError('invalid na_position: {!r}'.format(na_position)) - - cnt_null = mask.sum() - sorted_idx = items.argsort(ascending=ascending, kind=kind) - if ascending and na_position == 'last': - # NaN is coded as -1 and is listed in front after sorting - sorted_idx = np.roll(sorted_idx, -cnt_null) - elif not ascending and na_position == 'first': - # NaN is coded as -1 and is listed in the end after sorting - sorted_idx = np.roll(sorted_idx, cnt_null) - return sorted_idx if is_extension_array_dtype(items): items = items._values_for_argsort() diff --git a/pandas/tests/extension/test_categorical.py b/pandas/tests/extension/test_categorical.py index d05819b4099a9..4cf9f78e1531d 100644 --- a/pandas/tests/extension/test_categorical.py +++ b/pandas/tests/extension/test_categorical.py @@ -193,10 +193,6 @@ def test_searchsorted(self, data_for_sorting): if not data_for_sorting.ordered: raise pytest.skip(reason="searchsorted requires ordered data.") - @pytest.mark.xfail(reason="TODO") - def test_argsort_missing_array(self, data_missing_for_sorting): - super().test_argsort_missing_array(data_missing_for_sorting) - class TestCasting(base.BaseCastingTests): pass From c5bfb44c913bf488c51082bf2c993185f94cbdd9 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 3 Jul 2019 08:46:35 -0500 Subject: [PATCH 05/10] issue --- doc/source/whatsnew/v0.25.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 4ef6921cc53ce..5c22a3bcee227 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -688,7 +688,7 @@ Categorical.argsort now places missing values at the end ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ :meth:`Categorical.argsort` now places missing values at the end of the array, making it -consistent with NumPy and the rest of pandas. +consistent with NumPy and the rest of pandas (:issue:`21801`). .. ipython:: python From c9868c1004187ad3e6406b13737d8d7039d5d59a Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 3 Jul 2019 11:47:09 -0500 Subject: [PATCH 06/10] update for break --- pandas/tests/frame/test_sorting.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/frame/test_sorting.py b/pandas/tests/frame/test_sorting.py index 96aeb608ba3b8..24f5193517e0a 100644 --- a/pandas/tests/frame/test_sorting.py +++ b/pandas/tests/frame/test_sorting.py @@ -629,7 +629,7 @@ def test_sort_index_na_position_with_categories(self): reversed_categories = sorted(categories, reverse=True) reversed_category_indices = sorted(category_indices, reverse=True) - reversed_na_indices = sorted(na_indices, reverse=True) + reversed_na_indices = sorted(na_indices) df = pd.DataFrame({ column_name: pd.Categorical(['A', np.nan, 'B', np.nan, 'C'], From 7bedc4bd228d8afd035445a44cc9e6de3eda4c11 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 3 Jul 2019 12:41:06 -0500 Subject: [PATCH 07/10] update for break --- pandas/tests/frame/test_sorting.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/frame/test_sorting.py b/pandas/tests/frame/test_sorting.py index 24f5193517e0a..11de77f6779e6 100644 --- a/pandas/tests/frame/test_sorting.py +++ b/pandas/tests/frame/test_sorting.py @@ -539,7 +539,7 @@ def test_sort_index_categorical_index(self): assert_frame_equal(result, expected) result = df.sort_index(ascending=False) - expected = df.iloc[[3, 2, 5, 1, 0, 4]] + expected = df.iloc[[2, 3, 0, 1, 5, 4]] assert_frame_equal(result, expected) def test_sort_index(self): From c917575fcc9f0dbf94f8e09566149657a2567e97 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 3 Jul 2019 13:18:18 -0500 Subject: [PATCH 08/10] trigger CI From bd2cbfd6c987141ef37c54844d162c6e5ae4ef1c Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 3 Jul 2019 13:50:27 -0500 Subject: [PATCH 09/10] 32-bit compat --- pandas/tests/extension/base/methods.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/extension/base/methods.py b/pandas/tests/extension/base/methods.py index 80f824775ed83..0e6d4892f4239 100644 --- a/pandas/tests/extension/base/methods.py +++ b/pandas/tests/extension/base/methods.py @@ -49,7 +49,7 @@ def test_argsort(self, data_for_sorting): def test_argsort_missing_array(self, data_missing_for_sorting): result = data_missing_for_sorting.argsort() - expected = np.array([2, 0, 1]) + expected = np.array([2, 0, 1], dtype=np.dtype("int")) tm.assert_numpy_array_equal(result, expected) def test_argsort_missing(self, data_missing_for_sorting): From 47c6b57bb7d2b7959f5f96db6aeebe803d2dc1b0 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 3 Jul 2019 14:40:41 -0500 Subject: [PATCH 10/10] 32-bit compat --- pandas/tests/extension/base/methods.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pandas/tests/extension/base/methods.py b/pandas/tests/extension/base/methods.py index 0e6d4892f4239..9b154a8afeabc 100644 --- a/pandas/tests/extension/base/methods.py +++ b/pandas/tests/extension/base/methods.py @@ -50,6 +50,9 @@ def test_argsort(self, data_for_sorting): def test_argsort_missing_array(self, data_missing_for_sorting): result = data_missing_for_sorting.argsort() expected = np.array([2, 0, 1], dtype=np.dtype("int")) + # we don't care whether it's int32 or int64 + result = result.astype("int64", casting="safe") + expected = expected.astype("int64", casting="safe") tm.assert_numpy_array_equal(result, expected) def test_argsort_missing(self, data_missing_for_sorting):