diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index fd47ca14dc788..18dcaf1359fe3 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -459,6 +459,7 @@ Other API Changes - Most Pandas classes had a ``__bytes__`` method, which was used for getting a python2-style bytestring representation of the object. This method has been removed as a part of dropping Python2 (:issue:`26447`) - The ``.str``-accessor has been disabled for 1-level :class:`MultiIndex`, use :meth:`MultiIndex.to_flat_index` if necessary (:issue:`23679`) - Removed support of gtk package for clipboards (:issue:`26563`) +- :meth:`ExtensionArray.argsort` places NA values at the end of the sorted array, except :class:`Categorical` (:issue:`21801`) .. _whatsnew_0250.deprecations: diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index c709cd9e9f0b2..088ce9e380174 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -23,6 +23,7 @@ from pandas._typing import ArrayLike from pandas.core import ops +from pandas.core.sorting import nargsort _not_implemented_message = "{} does not implement {}." @@ -403,13 +404,13 @@ def argsort(self, ascending=True, kind='quicksort', *args, **kwargs): """ # Implementor note: You have two places to override the behavior of # argsort. - # 1. _values_for_argsort : construct the values passed to np.argsort + # 1. _values_for_argsort : construct the values passed to np.argsort. # 2. argsort : total control over sorting. ascending = nv.validate_argsort_with_ascending(ascending, args, kwargs) + values = self._values_for_argsort() - result = np.argsort(values, kind=kind, **kwargs) - if not ascending: - result = result[::-1] + result = nargsort(values, kind=kind, ascending=ascending, + na_position='last') return result def fillna(self, value=None, method=None, limit=None): diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index c079b860bb924..c776031526c48 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -1523,7 +1523,7 @@ def check_for_ordered(self, op): def _values_for_argsort(self): return self._codes.copy() - def argsort(self, *args, **kwargs): + def argsort(self, ascending=True, kind='quicksort', *args, **kwargs): # TODO(PY2): use correct signature # We have to do *args, **kwargs to avoid a a py2-only signature # issue since np.argsort differs from argsort. @@ -1567,8 +1567,12 @@ def argsort(self, *args, **kwargs): >>> cat.argsort() array([3, 0, 1, 2]) """ - # Keep the implementation here just for the docstring. - return super().argsort(*args, **kwargs) + ascending = nv.validate_argsort_with_ascending(ascending, args, kwargs) + values = self._values_for_argsort() + result = np.argsort(values, kind=kind, **kwargs) + if not ascending: + result = result[::-1] + return result def sort_values(self, inplace=False, ascending=True, na_position='last'): """ diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py index 29337b7f76131..1dbc8cde595e5 100644 --- a/pandas/core/arrays/integer.py +++ b/pandas/core/arrays/integer.py @@ -501,23 +501,6 @@ def value_counts(self, dropna=True): return Series(array, index=index) - def _values_for_argsort(self) -> np.ndarray: - """Return values for sorting. - - Returns - ------- - ndarray - The transformed values should maintain the ordering between values - within the array. - - See Also - -------- - ExtensionArray.argsort - """ - data = self._data.copy() - data[self._mask] = data.min() - 1 - return data - @classmethod def _create_comparison_method(cls, op): def cmp_method(self, other): diff --git a/pandas/core/arrays/period.py b/pandas/core/arrays/period.py index a6e282462af68..d22fea29843d4 100644 --- a/pandas/core/arrays/period.py +++ b/pandas/core/arrays/period.py @@ -676,7 +676,7 @@ def _check_timedeltalike_freq_compat(self, other): _raise_on_incompatible(self, other) def _values_for_argsort(self): - return self._data + return self PeriodArray._add_comparison_ops() diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py index 21c0c8f747b10..4478166626661 100644 --- a/pandas/core/sorting.py +++ b/pandas/core/sorting.py @@ -255,15 +255,17 @@ def nargsort(items, kind='quicksort', ascending=True, na_position='last'): sorted_idx = np.roll(sorted_idx, cnt_null) return sorted_idx - with warnings.catch_warnings(): - # https://github.com/pandas-dev/pandas/issues/25439 - # can be removed once ExtensionArrays are properly handled by nargsort - warnings.filterwarnings( - "ignore", category=FutureWarning, - message="Converting timezone-aware DatetimeArray to") - items = np.asanyarray(items) - idx = np.arange(len(items)) mask = isna(items) + if not isinstance(items, np.ndarray): + with warnings.catch_warnings(): + # https://github.com/pandas-dev/pandas/issues/25439 + # can be removed once ExtensionArrays are properly handled by + # nargsort + warnings.filterwarnings( + "ignore", category=FutureWarning, + message="Converting timezone-aware DatetimeArray to") + items = np.asanyarray(items) + idx = np.arange(len(items)) non_nans = items[~mask] non_nan_idx = idx[~mask] nan_idx = np.nonzero(mask)[0] diff --git a/pandas/tests/extension/base/methods.py b/pandas/tests/extension/base/methods.py index 1852edaa9e748..50fd4cd292641 100644 --- a/pandas/tests/extension/base/methods.py +++ b/pandas/tests/extension/base/methods.py @@ -44,6 +44,11 @@ def test_argsort_missing(self, data_missing_for_sorting): expected = pd.Series(np.array([1, -1, 0], dtype=np.int64)) self.assert_series_equal(result, expected) + def test_argsort_nan_last(self, data_missing_for_sorting): + # GH 21801 + result = data_missing_for_sorting.argsort() + assert result[-1] == 1 + @pytest.mark.parametrize('ascending', [True, False]) def test_sort_values(self, data_for_sorting, ascending): ser = pd.Series(data_for_sorting) diff --git a/pandas/tests/extension/decimal/array.py b/pandas/tests/extension/decimal/array.py index 1823eeb4d7fc0..ef0e44ec3c01c 100644 --- a/pandas/tests/extension/decimal/array.py +++ b/pandas/tests/extension/decimal/array.py @@ -153,6 +153,12 @@ def _reduce(self, name, skipna=True, **kwargs): "the {} operation".format(name)) return op(axis=0) + def _values_for_argsort(self): + data = self._data + mask = self.isna() + data[mask] = decimal.Decimal('Infinity') + return data + def to_decimal(values, context=None): return DecimalArray([decimal.Decimal(x) for x in values], context=context) diff --git a/pandas/tests/extension/json/array.py b/pandas/tests/extension/json/array.py index 4b93f0e12e32a..12713a8565228 100644 --- a/pandas/tests/extension/json/array.py +++ b/pandas/tests/extension/json/array.py @@ -182,7 +182,10 @@ def _values_for_argsort(self): # If all the elemnts of self are the same size P, NumPy will # cast them to an (N, P) array, instead of an (N,) array of tuples. frozen = [()] + [tuple(x.items()) for x in self] - return np.array(frozen, dtype=object)[1:] + data = np.array(frozen, dtype=object)[1:] + mask = self.isna() + data[mask] = np.nan + return data def make_data(): diff --git a/pandas/tests/extension/test_categorical.py b/pandas/tests/extension/test_categorical.py index 4cf9f78e1531d..a169a08fa0d51 100644 --- a/pandas/tests/extension/test_categorical.py +++ b/pandas/tests/extension/test_categorical.py @@ -193,6 +193,11 @@ def test_searchsorted(self, data_for_sorting): if not data_for_sorting.ordered: raise pytest.skip(reason="searchsorted requires ordered data.") + def test_argsort_nan_last(self, data_missing_for_sorting): + # GH 21801 + # TODO: Categorical.argsort places NA values at the end + pass + class TestCasting(base.BaseCastingTests): pass diff --git a/pandas/tests/test_sorting.py b/pandas/tests/test_sorting.py index 75fa37eb9af09..fb23c8822a000 100644 --- a/pandas/tests/test_sorting.py +++ b/pandas/tests/test_sorting.py @@ -7,7 +7,8 @@ import pytest from pandas import ( - DataFrame, MultiIndex, Series, array, concat, merge, to_datetime) + Categorical, DataFrame, MultiIndex, Series, array, concat, merge, + to_datetime) from pandas.core import common as com from pandas.core.sorting import ( decons_group_index, get_group_index, is_int64_overflow_possible, @@ -188,6 +189,14 @@ def test_nargsort_datetimearray_warning(self): with tm.assert_produces_warning(None): nargsort(data) + @pytest.mark.parametrize('data', [ + Categorical(['a', 'c', 'a', 'b']), + to_datetime([0, 2, 0, 1]).tz_localize('Europe/Brussels')]) + def test_nargsort_extension_array(self, data): + result = nargsort(data) + expected = np.array([0, 2, 3, 1], dtype=np.intp) + tm.assert_numpy_array_equal(result, expected) + class TestMerge: