From c22c01b2e6884b0da2bab86d6c3aed4735414fdf Mon Sep 17 00:00:00 2001 From: makbigc Date: Sat, 11 May 2019 22:33:06 +0800 Subject: [PATCH 01/22] Add mask as return value of _values_for_argsort --- pandas/core/arrays/base.py | 29 ++++++++++++++++--- pandas/core/arrays/categorical.py | 2 +- pandas/core/arrays/datetimelike.py | 2 +- pandas/core/arrays/integer.py | 6 ++-- pandas/core/arrays/numpy_.py | 2 +- pandas/core/arrays/period.py | 2 +- pandas/tests/extension/base/methods.py | 5 ++++ .../tests/extension/decimal/test_decimal.py | 6 ++++ pandas/tests/extension/json/array.py | 4 +-- pandas/tests/extension/json/test_json.py | 6 ++++ pandas/tests/extension/test_categorical.py | 6 ++++ pandas/tests/extension/test_datetime.py | 9 ++++++ pandas/tests/extension/test_integer.py | 6 ++++ pandas/tests/extension/test_interval.py | 7 +++++ pandas/tests/extension/test_numpy.py | 17 +++++++++++ pandas/tests/extension/test_period.py | 6 ++++ pandas/tests/extension/test_sparse.py | 6 ++++ 17 files changed, 109 insertions(+), 12 deletions(-) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index c709cd9e9f0b2..c0e88bf6cfc1c 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -361,7 +361,7 @@ def isna(self) -> ArrayLike: """ raise AbstractMethodError(self) - def _values_for_argsort(self) -> np.ndarray: + def _values_for_argsort(self) -> Tuple[np.ndarray, np.ndarray]: """ Return values for sorting. @@ -376,7 +376,7 @@ def _values_for_argsort(self) -> np.ndarray: ExtensionArray.argsort """ # Note: this is used in `ExtensionArray.argsort`. - return np.array(self) + return np.array(self), self.isna() def argsort(self, ascending=True, kind='quicksort', *args, **kwargs): """ @@ -406,8 +406,29 @@ def argsort(self, ascending=True, kind='quicksort', *args, **kwargs): # 1. _values_for_argsort : construct the values passed to np.argsort # 2. argsort : total control over sorting. ascending = nv.validate_argsort_with_ascending(ascending, args, kwargs) - values = self._values_for_argsort() - result = np.argsort(values, kind=kind, **kwargs) + values, mask = self._values_for_argsort() + + def permutation(mask): + # Return a permutation which maps the indices of the + # subarray without nan to the indices of the original array. + permu = np.arange(len(mask)) + nan_loc = np.arange(len(mask))[mask] + offset = 0 + for x in nan_loc: + permu[x - offset:] += 1 + offset += 1 + return permu + + if mask.any(): + notmask = ~mask + notnull = np.argsort(values[notmask], kind=kind, **kwargs) + permu = permutation(mask) + notnull = permu[notnull] + allnan = np.arange(len(self))[mask] + result = np.append(notnull, allnan) + else: + result = np.argsort(values, kind=kind, **kwargs) + if not ascending: result = result[::-1] return result diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 49dd0041854bc..c4ce43b55741b 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -1521,7 +1521,7 @@ def check_for_ordered(self, op): "Categorical to an ordered one\n".format(op=op)) def _values_for_argsort(self): - return self._codes.copy() + return self._codes.copy(), self.isna() def argsort(self, *args, **kwargs): # TODO(PY2): use correct signature diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index c32f8642dc2ed..a1696c3fd52da 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -621,7 +621,7 @@ def _from_factorized(cls, values, original): return cls(values, dtype=original.dtype) def _values_for_argsort(self): - return self._data + return self._data, self._isnan # ------------------------------------------------------------------ # Additional array methods diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py index 42aa6a055acca..78dda4790a067 100644 --- a/pandas/core/arrays/integer.py +++ b/pandas/core/arrays/integer.py @@ -512,7 +512,8 @@ def value_counts(self, dropna=True): return Series(array, index=index) - def _values_for_argsort(self) -> np.ndarray: + #def _values_for_argsort(self) -> np.ndarray: + def _values_for_argsort(self): """Return values for sorting. Returns @@ -526,8 +527,9 @@ def _values_for_argsort(self) -> np.ndarray: ExtensionArray.argsort """ data = self._data.copy() + mask = self._mask data[self._mask] = data.min() - 1 - return data + return data, mask @classmethod def _create_comparison_method(cls, op): diff --git a/pandas/core/arrays/numpy_.py b/pandas/core/arrays/numpy_.py index 0deefd2b10b6e..ab71d45bdbaf0 100644 --- a/pandas/core/arrays/numpy_.py +++ b/pandas/core/arrays/numpy_.py @@ -281,7 +281,7 @@ def copy(self, deep=False): return type(self)(self._ndarray.copy()) def _values_for_argsort(self): - return self._ndarray + return self._ndarray, self.isna() def _values_for_factorize(self): return self._ndarray, -1 diff --git a/pandas/core/arrays/period.py b/pandas/core/arrays/period.py index 8a6640e11ad74..8f1da6dbe5c4a 100644 --- a/pandas/core/arrays/period.py +++ b/pandas/core/arrays/period.py @@ -674,7 +674,7 @@ def _check_timedeltalike_freq_compat(self, other): _raise_on_incompatible(self, other) def _values_for_argsort(self): - return self._data + return self._data, self._isnan PeriodArray._add_comparison_ops() diff --git a/pandas/tests/extension/base/methods.py b/pandas/tests/extension/base/methods.py index 1852edaa9e748..be7346e66ed4c 100644 --- a/pandas/tests/extension/base/methods.py +++ b/pandas/tests/extension/base/methods.py @@ -44,6 +44,11 @@ def test_argsort_missing(self, data_missing_for_sorting): expected = pd.Series(np.array([1, -1, 0], dtype=np.int64)) self.assert_series_equal(result, expected) + def test_argsort_nan_loc(self, data_multiple_nan): + result = data_multiple_nan.argsort() + expected = np.array([3, 9, 7, 1, 0, 6, 2, 4, 5, 8]) + tm.assert_numpy_array_equal(result, expected) + @pytest.mark.parametrize('ascending', [True, False]) def test_sort_values(self, data_for_sorting, ascending): ser = pd.Series(data_for_sorting) diff --git a/pandas/tests/extension/decimal/test_decimal.py b/pandas/tests/extension/decimal/test_decimal.py index 97fae41bcc720..ba68a8edc947c 100644 --- a/pandas/tests/extension/decimal/test_decimal.py +++ b/pandas/tests/extension/decimal/test_decimal.py @@ -46,6 +46,12 @@ def data_missing_for_sorting(): decimal.Decimal('0')]) +@pytest.fixture +def data_multiple_nan(): + return DecimalArray([decimal.Decimal(x) for x in + [5, 4, np.nan, 1, np.nan, np.nan, 6, 3, np.nan, 2]]) + + @pytest.fixture def na_cmp(): return lambda x, y: x.is_nan() and y.is_nan() diff --git a/pandas/tests/extension/json/array.py b/pandas/tests/extension/json/array.py index 4b93f0e12e32a..dde88a4838e6a 100644 --- a/pandas/tests/extension/json/array.py +++ b/pandas/tests/extension/json/array.py @@ -171,7 +171,7 @@ def _concat_same_type(cls, to_concat): return cls(data) def _values_for_factorize(self): - frozen = self._values_for_argsort() + frozen, _ = self._values_for_argsort() if len(frozen) == 0: # _factorize_array expects 1-d array, this is a len-0 2-d array. frozen = frozen.ravel() @@ -182,7 +182,7 @@ def _values_for_argsort(self): # If all the elemnts of self are the same size P, NumPy will # cast them to an (N, P) array, instead of an (N,) array of tuples. frozen = [()] + [tuple(x.items()) for x in self] - return np.array(frozen, dtype=object)[1:] + return np.array(frozen, dtype=object)[1:], self.isna() def make_data(): diff --git a/pandas/tests/extension/json/test_json.py b/pandas/tests/extension/json/test_json.py index 97c329e0a5c92..47b7fd670b1da 100644 --- a/pandas/tests/extension/json/test_json.py +++ b/pandas/tests/extension/json/test_json.py @@ -50,6 +50,12 @@ def data_missing_for_sorting(): return JSONArray([{'b': 1}, {}, {'a': 4}]) +@pytest.fixture +def data_multiple_nan(): + return JSONArray([{'e': 5}, {'d': 4}, {}, {'a': 1}, {}, + {}, {'f': 6}, {'c': 3}, {}, {'b': 2}]) + + @pytest.fixture def na_value(dtype): return dtype.na_value diff --git a/pandas/tests/extension/test_categorical.py b/pandas/tests/extension/test_categorical.py index 4cf9f78e1531d..6105db64d8f19 100644 --- a/pandas/tests/extension/test_categorical.py +++ b/pandas/tests/extension/test_categorical.py @@ -68,6 +68,12 @@ def data_missing_for_sorting(): ordered=True) +@pytest.fixture +def data_multiple_nan(): + return Categorical(['E', 'D', None, 'A', None, + None, 'F', 'C', None, 'B'], ordered=True) + + @pytest.fixture def na_value(): return np.nan diff --git a/pandas/tests/extension/test_datetime.py b/pandas/tests/extension/test_datetime.py index baee04c3b79eb..600410e8864cb 100644 --- a/pandas/tests/extension/test_datetime.py +++ b/pandas/tests/extension/test_datetime.py @@ -45,6 +45,15 @@ def data_missing_for_sorting(dtype): dtype=dtype) +@pytest.fixture +def data_multiple_nan(dtype): + idx = pd.date_range(start='2000-01-01', end='2000-01-06') + return DatetimeArray(np.array([idx[4], idx[3], 'NaT', idx[0], 'NaT', + 'NaT', idx[5], idx[2], 'NaT', idx[1]], + dtype='datetime64[ns]'), + dtype=dtype) + + @pytest.fixture def data_for_grouping(dtype): """ diff --git a/pandas/tests/extension/test_integer.py b/pandas/tests/extension/test_integer.py index 22bb086a919ca..09295138c74ca 100644 --- a/pandas/tests/extension/test_integer.py +++ b/pandas/tests/extension/test_integer.py @@ -57,6 +57,12 @@ def data_for_sorting(dtype): return integer_array([1, 2, 0], dtype=dtype) +@pytest.fixture +def data_multiple_nan(dtype): + return integer_array([5, 4, np.nan, 1, np.nan, + np.nan, 6, 3, np.nan, 2], dtype=dtype) + + @pytest.fixture def data_missing_for_sorting(dtype): return integer_array([1, np.nan, 0], dtype=dtype) diff --git a/pandas/tests/extension/test_interval.py b/pandas/tests/extension/test_interval.py index f1f90b298ffe2..ff322c7db0ed3 100644 --- a/pandas/tests/extension/test_interval.py +++ b/pandas/tests/extension/test_interval.py @@ -57,6 +57,13 @@ def data_missing_for_sorting(): return IntervalArray.from_tuples([(1, 2), None, (0, 1)]) +@pytest.fixture +def data_multiple_nan(): + return IntervalArray.from_tuples([(5, 6), (4, 5), None, (1, 2), + None, None, (6, 7), (3, 4), + None, (2, 3)]) + + @pytest.fixture def na_value(): return np.nan diff --git a/pandas/tests/extension/test_numpy.py b/pandas/tests/extension/test_numpy.py index f31fa5b87cfe5..8b632f0c1d184 100644 --- a/pandas/tests/extension/test_numpy.py +++ b/pandas/tests/extension/test_numpy.py @@ -101,6 +101,23 @@ def data_missing_for_sorting(allow_in_pandas, dtype): ) +@pytest.fixture +def data_multiple_nan(allow_in_pandas, dtype): + """Length-10 array with a known sort order. + + This should be three items [B, NA, A] with + A < B and NA missing. + """ + if dtype.numpy_dtype == 'object': + return PandasArray( + np.array([(5,), (4,), np.nan, (1,), np.nan, + np.nan, (6,), (3,), np.nan, (2,)]) + ) + return PandasArray( + np.array([5, 4, np.nan, 1, np.nan, np.nan, 6, 3, np.nan, 2]) + ) + + @pytest.fixture def data_for_grouping(allow_in_pandas, dtype): """Data for factorization, grouping, and unique tests. diff --git a/pandas/tests/extension/test_period.py b/pandas/tests/extension/test_period.py index b988dcb211dd0..b289e318b9050 100644 --- a/pandas/tests/extension/test_period.py +++ b/pandas/tests/extension/test_period.py @@ -40,6 +40,12 @@ def data_missing_for_sorting(dtype): return PeriodArray([2018, iNaT, 2017], freq=dtype.freq) +@pytest.fixture +def data_multiple_nan(dtype): + return PeriodArray([2005, 2004, iNaT, 2001, iNaT, iNaT, + 2006, 2003, iNaT, 2002], freq=dtype.freq) + + @pytest.fixture def data_for_grouping(dtype): B = 2018 diff --git a/pandas/tests/extension/test_sparse.py b/pandas/tests/extension/test_sparse.py index faf1905ea1763..df1b87868c28d 100644 --- a/pandas/tests/extension/test_sparse.py +++ b/pandas/tests/extension/test_sparse.py @@ -65,6 +65,12 @@ def data_missing_for_sorting(request): return SparseArray([2, np.nan, 1], fill_value=request.param) +@pytest.fixture(params=[0, np.nan]) +def data_multiple_nan(request): + return SparseArray([5, 4, np.nan, 1, np.nan, + np.nan, 6, 3, np.nan, 2], fill_value=request.param) + + @pytest.fixture def na_value(): return np.nan From 93ac5899cad7c96b1db419dbd3c55d7b55f49c11 Mon Sep 17 00:00:00 2001 From: makbigc Date: Sun, 12 May 2019 13:44:47 +0800 Subject: [PATCH 02/22] Fix mypy typing and add annotation to the test --- pandas/core/arrays/integer.py | 5 ++--- pandas/tests/extension/base/methods.py | 1 + 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py index 78dda4790a067..3ad4e5670cb0c 100644 --- a/pandas/core/arrays/integer.py +++ b/pandas/core/arrays/integer.py @@ -1,6 +1,6 @@ import copy import sys -from typing import Type +from typing import Tuple, Type import warnings import numpy as np @@ -512,8 +512,7 @@ def value_counts(self, dropna=True): return Series(array, index=index) - #def _values_for_argsort(self) -> np.ndarray: - def _values_for_argsort(self): + def _values_for_argsort(self) -> Tuple[np.ndarray, np.ndarray]: """Return values for sorting. Returns diff --git a/pandas/tests/extension/base/methods.py b/pandas/tests/extension/base/methods.py index be7346e66ed4c..60aa8b9aa18e1 100644 --- a/pandas/tests/extension/base/methods.py +++ b/pandas/tests/extension/base/methods.py @@ -45,6 +45,7 @@ def test_argsort_missing(self, data_missing_for_sorting): self.assert_series_equal(result, expected) def test_argsort_nan_loc(self, data_multiple_nan): + # GH 21801 result = data_multiple_nan.argsort() expected = np.array([3, 9, 7, 1, 0, 6, 2, 4, 5, 8]) tm.assert_numpy_array_equal(result, expected) From b401cba0e9a9448816681f58625e8eecd152677f Mon Sep 17 00:00:00 2001 From: makbigc Date: Sun, 12 May 2019 13:59:47 +0800 Subject: [PATCH 03/22] Add whatsnew note --- doc/source/whatsnew/v0.25.0.rst | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 1fb9b5ae695a0..969eef201910f 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -82,7 +82,8 @@ Other Enhancements - :meth:`DataFrame.query` and :meth:`DataFrame.eval` now supports quoting column names with backticks to refer to names with spaces (:issue:`6508`) - :func:`merge_asof` now gives a more clear error message when merge keys are categoricals that are not equal (:issue:`26136`) - :meth:`pandas.core.window.Rolling` supports exponential (or Poisson) window type (:issue:`21303`) -- +- :meth:`ExtensionArray.argsort` places NA values at the end of the sorted array (:issue:`21801`) +>>>>>>> Add whatsnew note .. _whatsnew_0250.api_breaking: From aa50eed89ca3a178bbdda7eb42ae2652d451073a Mon Sep 17 00:00:00 2001 From: makbigc Date: Sun, 12 May 2019 16:06:04 +0800 Subject: [PATCH 04/22] Modify nargsort --- pandas/core/sorting.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py index 21c0c8f747b10..a3631576978a1 100644 --- a/pandas/core/sorting.py +++ b/pandas/core/sorting.py @@ -247,12 +247,11 @@ def nargsort(items, kind='quicksort', ascending=True, na_position='last'): mask = isna(items) cnt_null = mask.sum() sorted_idx = items.argsort(ascending=ascending, kind=kind) - if ascending and na_position == 'last': - # NaN is coded as -1 and is listed in front after sorting - sorted_idx = np.roll(sorted_idx, -cnt_null) - elif not ascending and na_position == 'first': - # NaN is coded as -1 and is listed in the end after sorting + + if ascending and na_position == 'first': sorted_idx = np.roll(sorted_idx, cnt_null) + elif not ascending and na_position == 'last': + sorted_idx = np.roll(sorted_idx, -cnt_null) return sorted_idx with warnings.catch_warnings(): From d38a5e1ecf8273854380a922e6988ecef84d2e21 Mon Sep 17 00:00:00 2001 From: makbigc Date: Thu, 16 May 2019 19:42:51 +0800 Subject: [PATCH 05/22] Simplify the subfunction permutation --- pandas/core/arrays/base.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index c0e88bf6cfc1c..a1385f74adc61 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -412,11 +412,7 @@ def permutation(mask): # Return a permutation which maps the indices of the # subarray without nan to the indices of the original array. permu = np.arange(len(mask)) - nan_loc = np.arange(len(mask))[mask] - offset = 0 - for x in nan_loc: - permu[x - offset:] += 1 - offset += 1 + permu = permu[~mask] return permu if mask.any(): From bae30fd7e8ba2363996b5823269d39e9b6443c0a Mon Sep 17 00:00:00 2001 From: makbigc Date: Thu, 16 May 2019 20:01:06 +0800 Subject: [PATCH 06/22] Remove test_argsort_nan_loc --- pandas/tests/extension/test_categorical.py | 6 ------ pandas/tests/extension/test_datetime.py | 9 --------- pandas/tests/extension/test_integer.py | 6 ------ pandas/tests/extension/test_interval.py | 7 ------- pandas/tests/extension/test_numpy.py | 17 ----------------- pandas/tests/extension/test_period.py | 6 ------ pandas/tests/extension/test_sparse.py | 6 ------ 7 files changed, 57 deletions(-) diff --git a/pandas/tests/extension/test_categorical.py b/pandas/tests/extension/test_categorical.py index 6105db64d8f19..4cf9f78e1531d 100644 --- a/pandas/tests/extension/test_categorical.py +++ b/pandas/tests/extension/test_categorical.py @@ -68,12 +68,6 @@ def data_missing_for_sorting(): ordered=True) -@pytest.fixture -def data_multiple_nan(): - return Categorical(['E', 'D', None, 'A', None, - None, 'F', 'C', None, 'B'], ordered=True) - - @pytest.fixture def na_value(): return np.nan diff --git a/pandas/tests/extension/test_datetime.py b/pandas/tests/extension/test_datetime.py index 600410e8864cb..baee04c3b79eb 100644 --- a/pandas/tests/extension/test_datetime.py +++ b/pandas/tests/extension/test_datetime.py @@ -45,15 +45,6 @@ def data_missing_for_sorting(dtype): dtype=dtype) -@pytest.fixture -def data_multiple_nan(dtype): - idx = pd.date_range(start='2000-01-01', end='2000-01-06') - return DatetimeArray(np.array([idx[4], idx[3], 'NaT', idx[0], 'NaT', - 'NaT', idx[5], idx[2], 'NaT', idx[1]], - dtype='datetime64[ns]'), - dtype=dtype) - - @pytest.fixture def data_for_grouping(dtype): """ diff --git a/pandas/tests/extension/test_integer.py b/pandas/tests/extension/test_integer.py index 09295138c74ca..22bb086a919ca 100644 --- a/pandas/tests/extension/test_integer.py +++ b/pandas/tests/extension/test_integer.py @@ -57,12 +57,6 @@ def data_for_sorting(dtype): return integer_array([1, 2, 0], dtype=dtype) -@pytest.fixture -def data_multiple_nan(dtype): - return integer_array([5, 4, np.nan, 1, np.nan, - np.nan, 6, 3, np.nan, 2], dtype=dtype) - - @pytest.fixture def data_missing_for_sorting(dtype): return integer_array([1, np.nan, 0], dtype=dtype) diff --git a/pandas/tests/extension/test_interval.py b/pandas/tests/extension/test_interval.py index ff322c7db0ed3..f1f90b298ffe2 100644 --- a/pandas/tests/extension/test_interval.py +++ b/pandas/tests/extension/test_interval.py @@ -57,13 +57,6 @@ def data_missing_for_sorting(): return IntervalArray.from_tuples([(1, 2), None, (0, 1)]) -@pytest.fixture -def data_multiple_nan(): - return IntervalArray.from_tuples([(5, 6), (4, 5), None, (1, 2), - None, None, (6, 7), (3, 4), - None, (2, 3)]) - - @pytest.fixture def na_value(): return np.nan diff --git a/pandas/tests/extension/test_numpy.py b/pandas/tests/extension/test_numpy.py index 8b632f0c1d184..f31fa5b87cfe5 100644 --- a/pandas/tests/extension/test_numpy.py +++ b/pandas/tests/extension/test_numpy.py @@ -101,23 +101,6 @@ def data_missing_for_sorting(allow_in_pandas, dtype): ) -@pytest.fixture -def data_multiple_nan(allow_in_pandas, dtype): - """Length-10 array with a known sort order. - - This should be three items [B, NA, A] with - A < B and NA missing. - """ - if dtype.numpy_dtype == 'object': - return PandasArray( - np.array([(5,), (4,), np.nan, (1,), np.nan, - np.nan, (6,), (3,), np.nan, (2,)]) - ) - return PandasArray( - np.array([5, 4, np.nan, 1, np.nan, np.nan, 6, 3, np.nan, 2]) - ) - - @pytest.fixture def data_for_grouping(allow_in_pandas, dtype): """Data for factorization, grouping, and unique tests. diff --git a/pandas/tests/extension/test_period.py b/pandas/tests/extension/test_period.py index b289e318b9050..b988dcb211dd0 100644 --- a/pandas/tests/extension/test_period.py +++ b/pandas/tests/extension/test_period.py @@ -40,12 +40,6 @@ def data_missing_for_sorting(dtype): return PeriodArray([2018, iNaT, 2017], freq=dtype.freq) -@pytest.fixture -def data_multiple_nan(dtype): - return PeriodArray([2005, 2004, iNaT, 2001, iNaT, iNaT, - 2006, 2003, iNaT, 2002], freq=dtype.freq) - - @pytest.fixture def data_for_grouping(dtype): B = 2018 diff --git a/pandas/tests/extension/test_sparse.py b/pandas/tests/extension/test_sparse.py index df1b87868c28d..faf1905ea1763 100644 --- a/pandas/tests/extension/test_sparse.py +++ b/pandas/tests/extension/test_sparse.py @@ -65,12 +65,6 @@ def data_missing_for_sorting(request): return SparseArray([2, np.nan, 1], fill_value=request.param) -@pytest.fixture(params=[0, np.nan]) -def data_multiple_nan(request): - return SparseArray([5, 4, np.nan, 1, np.nan, - np.nan, 6, 3, np.nan, 2], fill_value=request.param) - - @pytest.fixture def na_value(): return np.nan From effe18c4df62e36710ba23b3282f4fbc89d9d98a Mon Sep 17 00:00:00 2001 From: makbigc Date: Thu, 16 May 2019 20:09:51 +0800 Subject: [PATCH 07/22] Change the docstring of _values_for_argsort --- pandas/core/arrays/base.py | 2 ++ pandas/core/arrays/integer.py | 2 ++ 2 files changed, 4 insertions(+) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index a1385f74adc61..2a884a65d0fe6 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -370,6 +370,8 @@ def _values_for_argsort(self) -> Tuple[np.ndarray, np.ndarray]: ndarray The transformed values should maintain the ordering between values within the array. + ndarray + The mask which indicates the NaN values. See Also -------- diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py index 3ad4e5670cb0c..8395b800ed0fa 100644 --- a/pandas/core/arrays/integer.py +++ b/pandas/core/arrays/integer.py @@ -520,6 +520,8 @@ def _values_for_argsort(self) -> Tuple[np.ndarray, np.ndarray]: ndarray The transformed values should maintain the ordering between values within the array. + ndarray + The mask which indicates the NaN values. See Also -------- From d1dd5fa272ea05a7520b5be9fe6b52ec8827614d Mon Sep 17 00:00:00 2001 From: makbigc Date: Thu, 16 May 2019 20:17:44 +0800 Subject: [PATCH 08/22] Remove the code related to test_argsort_nan_loc --- pandas/tests/extension/base/methods.py | 6 ------ pandas/tests/extension/decimal/test_decimal.py | 6 ------ pandas/tests/extension/json/test_json.py | 6 ------ 3 files changed, 18 deletions(-) diff --git a/pandas/tests/extension/base/methods.py b/pandas/tests/extension/base/methods.py index 60aa8b9aa18e1..1852edaa9e748 100644 --- a/pandas/tests/extension/base/methods.py +++ b/pandas/tests/extension/base/methods.py @@ -44,12 +44,6 @@ def test_argsort_missing(self, data_missing_for_sorting): expected = pd.Series(np.array([1, -1, 0], dtype=np.int64)) self.assert_series_equal(result, expected) - def test_argsort_nan_loc(self, data_multiple_nan): - # GH 21801 - result = data_multiple_nan.argsort() - expected = np.array([3, 9, 7, 1, 0, 6, 2, 4, 5, 8]) - tm.assert_numpy_array_equal(result, expected) - @pytest.mark.parametrize('ascending', [True, False]) def test_sort_values(self, data_for_sorting, ascending): ser = pd.Series(data_for_sorting) diff --git a/pandas/tests/extension/decimal/test_decimal.py b/pandas/tests/extension/decimal/test_decimal.py index ba68a8edc947c..97fae41bcc720 100644 --- a/pandas/tests/extension/decimal/test_decimal.py +++ b/pandas/tests/extension/decimal/test_decimal.py @@ -46,12 +46,6 @@ def data_missing_for_sorting(): decimal.Decimal('0')]) -@pytest.fixture -def data_multiple_nan(): - return DecimalArray([decimal.Decimal(x) for x in - [5, 4, np.nan, 1, np.nan, np.nan, 6, 3, np.nan, 2]]) - - @pytest.fixture def na_cmp(): return lambda x, y: x.is_nan() and y.is_nan() diff --git a/pandas/tests/extension/json/test_json.py b/pandas/tests/extension/json/test_json.py index 47b7fd670b1da..97c329e0a5c92 100644 --- a/pandas/tests/extension/json/test_json.py +++ b/pandas/tests/extension/json/test_json.py @@ -50,12 +50,6 @@ def data_missing_for_sorting(): return JSONArray([{'b': 1}, {}, {'a': 4}]) -@pytest.fixture -def data_multiple_nan(): - return JSONArray([{'e': 5}, {'d': 4}, {}, {'a': 1}, {}, - {}, {'f': 6}, {'c': 3}, {}, {'b': 2}]) - - @pytest.fixture def na_value(dtype): return dtype.na_value From bf1e927cf318d8af10e0c6e30794a163e585d872 Mon Sep 17 00:00:00 2001 From: makbigc Date: Tue, 21 May 2019 12:46:02 +0800 Subject: [PATCH 09/22] Change docstring and move the entry in whatsnew --- doc/source/whatsnew/v0.25.0.rst | 6 +++++- pandas/core/arrays/base.py | 2 +- pandas/core/arrays/integer.py | 2 +- 3 files changed, 7 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 969eef201910f..8f5072a2db3ea 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -82,8 +82,8 @@ Other Enhancements - :meth:`DataFrame.query` and :meth:`DataFrame.eval` now supports quoting column names with backticks to refer to names with spaces (:issue:`6508`) - :func:`merge_asof` now gives a more clear error message when merge keys are categoricals that are not equal (:issue:`26136`) - :meth:`pandas.core.window.Rolling` supports exponential (or Poisson) window type (:issue:`21303`) +<<<<<<< HEAD - :meth:`ExtensionArray.argsort` places NA values at the end of the sorted array (:issue:`21801`) ->>>>>>> Add whatsnew note .. _whatsnew_0250.api_breaking: @@ -435,10 +435,14 @@ Other API Changes - Comparing :class:`Timestamp` with unsupported objects now returns :py:obj:`NotImplemented` instead of raising ``TypeError``. This implies that unsupported rich comparisons are delegated to the other object, and are now consistent with Python 3 behavior for ``datetime`` objects (:issue:`24011`) - Bug in :meth:`DatetimeIndex.snap` which didn't preserving the ``name`` of the input :class:`Index` (:issue:`25575`) - The ``arg`` argument in :meth:`pandas.core.groupby.DataFrameGroupBy.agg` has been renamed to ``func`` (:issue:`26089`) +<<<<<<< HEAD - The ``arg`` argument in :meth:`pandas.core.window._Window.aggregate` has been renamed to ``func`` (:issue:`26372`) - Most Pandas classes had a ``__bytes__`` method, which was used for getting a python2-style bytestring representation of the object. This method has been removed as a part of dropping Python2 (:issue:`26447`) - The ``.str``-accessor has been disabled for 1-level :class:`MultiIndex`, use :meth:`MultiIndex.to_flat_index` if necessary (:issue:`23679`) - Removed support of gtk package for clipboards (:issue:`26563`) +======= +- :meth:`ExtensionArray.argsort` places the nan at the end of the sorted array (:issue:`21801`) +>>>>>>> Change docstring and move the entry in whatsnew .. _whatsnew_0250.deprecations: diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 2a884a65d0fe6..25626985e9b83 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -371,7 +371,7 @@ def _values_for_argsort(self) -> Tuple[np.ndarray, np.ndarray]: The transformed values should maintain the ordering between values within the array. ndarray - The mask which indicates the NaN values. + The mask which indicates the NA values. See Also -------- diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py index 8395b800ed0fa..ac1c99b05efcc 100644 --- a/pandas/core/arrays/integer.py +++ b/pandas/core/arrays/integer.py @@ -521,7 +521,7 @@ def _values_for_argsort(self) -> Tuple[np.ndarray, np.ndarray]: The transformed values should maintain the ordering between values within the array. ndarray - The mask which indicates the NaN values. + The mask which indicates the NA values. See Also -------- From 19254233f3ae6d8d3f2e3cffa9658b4724a8d446 Mon Sep 17 00:00:00 2001 From: makbigc Date: Sun, 26 May 2019 21:36:51 +0800 Subject: [PATCH 10/22] amend after review --- doc/source/whatsnew/v0.25.0.rst | 4 ++++ pandas/core/arrays/base.py | 16 ++++++++-------- pandas/core/arrays/integer.py | 2 ++ 3 files changed, 14 insertions(+), 8 deletions(-) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 8f5072a2db3ea..47150e3e1d6a2 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -438,11 +438,15 @@ Other API Changes <<<<<<< HEAD - The ``arg`` argument in :meth:`pandas.core.window._Window.aggregate` has been renamed to ``func`` (:issue:`26372`) - Most Pandas classes had a ``__bytes__`` method, which was used for getting a python2-style bytestring representation of the object. This method has been removed as a part of dropping Python2 (:issue:`26447`) +<<<<<<< HEAD - The ``.str``-accessor has been disabled for 1-level :class:`MultiIndex`, use :meth:`MultiIndex.to_flat_index` if necessary (:issue:`23679`) - Removed support of gtk package for clipboards (:issue:`26563`) ======= - :meth:`ExtensionArray.argsort` places the nan at the end of the sorted array (:issue:`21801`) >>>>>>> Change docstring and move the entry in whatsnew +======= +- :meth:`ExtensionArray.argsort` places the ``nan`` at the end of the sorted array (:issue:`21801`) +>>>>>>> amend after review .. _whatsnew_0250.deprecations: diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 25626985e9b83..66affb187a13a 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -373,6 +373,8 @@ def _values_for_argsort(self) -> Tuple[np.ndarray, np.ndarray]: ndarray The mask which indicates the NA values. + .. versionadded:: 0.25.0 + See Also -------- ExtensionArray.argsort @@ -410,17 +412,15 @@ def argsort(self, ascending=True, kind='quicksort', *args, **kwargs): ascending = nv.validate_argsort_with_ascending(ascending, args, kwargs) values, mask = self._values_for_argsort() - def permutation(mask): - # Return a permutation which maps the indices of the - # subarray without nan to the indices of the original array. - permu = np.arange(len(mask)) - permu = permu[~mask] - return permu - if mask.any(): notmask = ~mask notnull = np.argsort(values[notmask], kind=kind, **kwargs) - permu = permutation(mask) + + # permu maps the indices of the subarray + # without nan to the indices of the original array. + permu = np.arange(len(mask)) + permu = permu[~mask] + notnull = permu[notnull] allnan = np.arange(len(self))[mask] result = np.append(notnull, allnan) diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py index ac1c99b05efcc..02e16840b95bf 100644 --- a/pandas/core/arrays/integer.py +++ b/pandas/core/arrays/integer.py @@ -523,6 +523,8 @@ def _values_for_argsort(self) -> Tuple[np.ndarray, np.ndarray]: ndarray The mask which indicates the NA values. + .. versionadded:: 0.25.0 + See Also -------- ExtensionArray.argsort From c57dff7d3063a5a3db877871621b6cc824d27514 Mon Sep 17 00:00:00 2001 From: makbigc Date: Thu, 6 Jun 2019 14:51:59 +0800 Subject: [PATCH 11/22] Use sorting.nargsort and remove some _values_for_argsort --- pandas/core/arrays/base.py | 45 +++++----------------------- pandas/core/arrays/categorical.py | 12 +++++--- pandas/core/arrays/datetimelike.py | 3 -- pandas/core/arrays/integer.py | 22 -------------- pandas/core/arrays/numpy_.py | 3 -- pandas/core/arrays/period.py | 3 -- pandas/core/sorting.py | 9 +++--- pandas/tests/extension/json/array.py | 4 +-- 8 files changed, 22 insertions(+), 79 deletions(-) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 66affb187a13a..61ed936ace2a2 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -23,6 +23,7 @@ from pandas._typing import ArrayLike from pandas.core import ops +from pandas.core.sorting import nargsort _not_implemented_message = "{} does not implement {}." @@ -361,27 +362,6 @@ def isna(self) -> ArrayLike: """ raise AbstractMethodError(self) - def _values_for_argsort(self) -> Tuple[np.ndarray, np.ndarray]: - """ - Return values for sorting. - - Returns - ------- - ndarray - The transformed values should maintain the ordering between values - within the array. - ndarray - The mask which indicates the NA values. - - .. versionadded:: 0.25.0 - - See Also - -------- - ExtensionArray.argsort - """ - # Note: this is used in `ExtensionArray.argsort`. - return np.array(self), self.isna() - def argsort(self, ascending=True, kind='quicksort', *args, **kwargs): """ Return the indices that would sort this array. @@ -410,25 +390,14 @@ def argsort(self, ascending=True, kind='quicksort', *args, **kwargs): # 1. _values_for_argsort : construct the values passed to np.argsort # 2. argsort : total control over sorting. ascending = nv.validate_argsort_with_ascending(ascending, args, kwargs) - values, mask = self._values_for_argsort() - if mask.any(): - notmask = ~mask - notnull = np.argsort(values[notmask], kind=kind, **kwargs) - - # permu maps the indices of the subarray - # without nan to the indices of the original array. - permu = np.arange(len(mask)) - permu = permu[~mask] - - notnull = permu[notnull] - allnan = np.arange(len(self))[mask] - result = np.append(notnull, allnan) + if hasattr(self, '_values_for_argsort'): + values = self._values_for_argsort() else: - result = np.argsort(values, kind=kind, **kwargs) - - if not ascending: - result = result[::-1] + values = self + na_position = 'last' if ascending else 'first' + result = nargsort(values, kind=kind, ascending=ascending, + na_position=na_position) return result def fillna(self, value=None, method=None, limit=None): diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index c4ce43b55741b..4785be000bcb6 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -1521,9 +1521,9 @@ def check_for_ordered(self, op): "Categorical to an ordered one\n".format(op=op)) def _values_for_argsort(self): - return self._codes.copy(), self.isna() + return self._codes.copy() - def argsort(self, *args, **kwargs): + def argsort(self, ascending=True, kind='quicksort', *args, **kwargs): # TODO(PY2): use correct signature # We have to do *args, **kwargs to avoid a a py2-only signature # issue since np.argsort differs from argsort. @@ -1567,8 +1567,12 @@ def argsort(self, *args, **kwargs): >>> cat.argsort() array([3, 0, 1, 2]) """ - # Keep the implementation here just for the docstring. - return super().argsort(*args, **kwargs) + ascending = nv.validate_argsort_with_ascending(ascending, args, kwargs) + values = self._values_for_argsort() + result = np.argsort(values, kind=kind, **kwargs) + if not ascending: + result = result[::-1] + return result def sort_values(self, inplace=False, ascending=True, na_position='last'): """ diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index a1696c3fd52da..e1a7eb625a5c7 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -620,9 +620,6 @@ def _values_for_factorize(self): def _from_factorized(cls, values, original): return cls(values, dtype=original.dtype) - def _values_for_argsort(self): - return self._data, self._isnan - # ------------------------------------------------------------------ # Additional array methods # These are not part of the EA API, but we implement them because diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py index 02e16840b95bf..3b856323348e6 100644 --- a/pandas/core/arrays/integer.py +++ b/pandas/core/arrays/integer.py @@ -512,28 +512,6 @@ def value_counts(self, dropna=True): return Series(array, index=index) - def _values_for_argsort(self) -> Tuple[np.ndarray, np.ndarray]: - """Return values for sorting. - - Returns - ------- - ndarray - The transformed values should maintain the ordering between values - within the array. - ndarray - The mask which indicates the NA values. - - .. versionadded:: 0.25.0 - - See Also - -------- - ExtensionArray.argsort - """ - data = self._data.copy() - mask = self._mask - data[self._mask] = data.min() - 1 - return data, mask - @classmethod def _create_comparison_method(cls, op): def cmp_method(self, other): diff --git a/pandas/core/arrays/numpy_.py b/pandas/core/arrays/numpy_.py index ab71d45bdbaf0..92cf7e9425cd3 100644 --- a/pandas/core/arrays/numpy_.py +++ b/pandas/core/arrays/numpy_.py @@ -280,9 +280,6 @@ def take(self, indices, allow_fill=False, fill_value=None): def copy(self, deep=False): return type(self)(self._ndarray.copy()) - def _values_for_argsort(self): - return self._ndarray, self.isna() - def _values_for_factorize(self): return self._ndarray, -1 diff --git a/pandas/core/arrays/period.py b/pandas/core/arrays/period.py index 8f1da6dbe5c4a..0d775bdec5a7b 100644 --- a/pandas/core/arrays/period.py +++ b/pandas/core/arrays/period.py @@ -673,9 +673,6 @@ def _check_timedeltalike_freq_compat(self, other): _raise_on_incompatible(self, other) - def _values_for_argsort(self): - return self._data, self._isnan - PeriodArray._add_comparison_ops() diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py index a3631576978a1..21c0c8f747b10 100644 --- a/pandas/core/sorting.py +++ b/pandas/core/sorting.py @@ -247,11 +247,12 @@ def nargsort(items, kind='quicksort', ascending=True, na_position='last'): mask = isna(items) cnt_null = mask.sum() sorted_idx = items.argsort(ascending=ascending, kind=kind) - - if ascending and na_position == 'first': - sorted_idx = np.roll(sorted_idx, cnt_null) - elif not ascending and na_position == 'last': + if ascending and na_position == 'last': + # NaN is coded as -1 and is listed in front after sorting sorted_idx = np.roll(sorted_idx, -cnt_null) + elif not ascending and na_position == 'first': + # NaN is coded as -1 and is listed in the end after sorting + sorted_idx = np.roll(sorted_idx, cnt_null) return sorted_idx with warnings.catch_warnings(): diff --git a/pandas/tests/extension/json/array.py b/pandas/tests/extension/json/array.py index dde88a4838e6a..4b93f0e12e32a 100644 --- a/pandas/tests/extension/json/array.py +++ b/pandas/tests/extension/json/array.py @@ -171,7 +171,7 @@ def _concat_same_type(cls, to_concat): return cls(data) def _values_for_factorize(self): - frozen, _ = self._values_for_argsort() + frozen = self._values_for_argsort() if len(frozen) == 0: # _factorize_array expects 1-d array, this is a len-0 2-d array. frozen = frozen.ravel() @@ -182,7 +182,7 @@ def _values_for_argsort(self): # If all the elemnts of self are the same size P, NumPy will # cast them to an (N, P) array, instead of an (N,) array of tuples. frozen = [()] + [tuple(x.items()) for x in self] - return np.array(frozen, dtype=object)[1:], self.isna() + return np.array(frozen, dtype=object)[1:] def make_data(): From 16cff4e0cdf95c68d82425b4bb0dd78b23d99a68 Mon Sep 17 00:00:00 2001 From: makbigc Date: Thu, 6 Jun 2019 14:56:10 +0800 Subject: [PATCH 12/22] Fix lint error --- pandas/core/arrays/integer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py index 3b856323348e6..3c4ae83b31bdc 100644 --- a/pandas/core/arrays/integer.py +++ b/pandas/core/arrays/integer.py @@ -1,6 +1,6 @@ import copy import sys -from typing import Tuple, Type +from typing import Type import warnings import numpy as np From bc713cd4f1efd69a7ca20b263be40db261aaa511 Mon Sep 17 00:00:00 2001 From: makbigc Date: Thu, 6 Jun 2019 15:09:41 +0800 Subject: [PATCH 13/22] remove relundent lines added in rebase --- doc/source/whatsnew/v0.25.0.rst | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 47150e3e1d6a2..4f45e4554487a 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -82,8 +82,6 @@ Other Enhancements - :meth:`DataFrame.query` and :meth:`DataFrame.eval` now supports quoting column names with backticks to refer to names with spaces (:issue:`6508`) - :func:`merge_asof` now gives a more clear error message when merge keys are categoricals that are not equal (:issue:`26136`) - :meth:`pandas.core.window.Rolling` supports exponential (or Poisson) window type (:issue:`21303`) -<<<<<<< HEAD -- :meth:`ExtensionArray.argsort` places NA values at the end of the sorted array (:issue:`21801`) .. _whatsnew_0250.api_breaking: @@ -435,18 +433,11 @@ Other API Changes - Comparing :class:`Timestamp` with unsupported objects now returns :py:obj:`NotImplemented` instead of raising ``TypeError``. This implies that unsupported rich comparisons are delegated to the other object, and are now consistent with Python 3 behavior for ``datetime`` objects (:issue:`24011`) - Bug in :meth:`DatetimeIndex.snap` which didn't preserving the ``name`` of the input :class:`Index` (:issue:`25575`) - The ``arg`` argument in :meth:`pandas.core.groupby.DataFrameGroupBy.agg` has been renamed to ``func`` (:issue:`26089`) -<<<<<<< HEAD - The ``arg`` argument in :meth:`pandas.core.window._Window.aggregate` has been renamed to ``func`` (:issue:`26372`) - Most Pandas classes had a ``__bytes__`` method, which was used for getting a python2-style bytestring representation of the object. This method has been removed as a part of dropping Python2 (:issue:`26447`) -<<<<<<< HEAD - The ``.str``-accessor has been disabled for 1-level :class:`MultiIndex`, use :meth:`MultiIndex.to_flat_index` if necessary (:issue:`23679`) - Removed support of gtk package for clipboards (:issue:`26563`) -======= -- :meth:`ExtensionArray.argsort` places the nan at the end of the sorted array (:issue:`21801`) ->>>>>>> Change docstring and move the entry in whatsnew -======= -- :meth:`ExtensionArray.argsort` places the ``nan`` at the end of the sorted array (:issue:`21801`) ->>>>>>> amend after review +- :meth:`ExtensionArray.argsort` places NA values at the end of the sorted array (:issue:`21801`) .. _whatsnew_0250.deprecations: From b10e5f9e9ae7971f813c3926ed153bb2d34ee1f3 Mon Sep 17 00:00:00 2001 From: makbigc Date: Sat, 8 Jun 2019 17:18:22 +0800 Subject: [PATCH 14/22] Add test for NA-last behaviour --- pandas/core/arrays/base.py | 5 ++++- pandas/tests/extension/base/methods.py | 5 +++++ pandas/tests/extension/decimal/array.py | 6 ++++++ pandas/tests/extension/json/array.py | 5 ++++- pandas/tests/extension/test_categorical.py | 5 +++++ 5 files changed, 24 insertions(+), 2 deletions(-) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 61ed936ace2a2..5610300f6f690 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -387,7 +387,10 @@ def argsort(self, ascending=True, kind='quicksort', *args, **kwargs): """ # Implementor note: You have two places to override the behavior of # argsort. - # 1. _values_for_argsort : construct the values passed to np.argsort + # 1. _values_for_argsort : construct the values passed to np.argsort. + # The transformed values should maintain the ordering between values + # within the array. Preferably, the largest value is assiged to NA + # values in transformation. # 2. argsort : total control over sorting. ascending = nv.validate_argsort_with_ascending(ascending, args, kwargs) diff --git a/pandas/tests/extension/base/methods.py b/pandas/tests/extension/base/methods.py index 1852edaa9e748..50fd4cd292641 100644 --- a/pandas/tests/extension/base/methods.py +++ b/pandas/tests/extension/base/methods.py @@ -44,6 +44,11 @@ def test_argsort_missing(self, data_missing_for_sorting): expected = pd.Series(np.array([1, -1, 0], dtype=np.int64)) self.assert_series_equal(result, expected) + def test_argsort_nan_last(self, data_missing_for_sorting): + # GH 21801 + result = data_missing_for_sorting.argsort() + assert result[-1] == 1 + @pytest.mark.parametrize('ascending', [True, False]) def test_sort_values(self, data_for_sorting, ascending): ser = pd.Series(data_for_sorting) diff --git a/pandas/tests/extension/decimal/array.py b/pandas/tests/extension/decimal/array.py index 1823eeb4d7fc0..ef0e44ec3c01c 100644 --- a/pandas/tests/extension/decimal/array.py +++ b/pandas/tests/extension/decimal/array.py @@ -153,6 +153,12 @@ def _reduce(self, name, skipna=True, **kwargs): "the {} operation".format(name)) return op(axis=0) + def _values_for_argsort(self): + data = self._data + mask = self.isna() + data[mask] = decimal.Decimal('Infinity') + return data + def to_decimal(values, context=None): return DecimalArray([decimal.Decimal(x) for x in values], context=context) diff --git a/pandas/tests/extension/json/array.py b/pandas/tests/extension/json/array.py index 4b93f0e12e32a..12713a8565228 100644 --- a/pandas/tests/extension/json/array.py +++ b/pandas/tests/extension/json/array.py @@ -182,7 +182,10 @@ def _values_for_argsort(self): # If all the elemnts of self are the same size P, NumPy will # cast them to an (N, P) array, instead of an (N,) array of tuples. frozen = [()] + [tuple(x.items()) for x in self] - return np.array(frozen, dtype=object)[1:] + data = np.array(frozen, dtype=object)[1:] + mask = self.isna() + data[mask] = np.nan + return data def make_data(): diff --git a/pandas/tests/extension/test_categorical.py b/pandas/tests/extension/test_categorical.py index 4cf9f78e1531d..a169a08fa0d51 100644 --- a/pandas/tests/extension/test_categorical.py +++ b/pandas/tests/extension/test_categorical.py @@ -193,6 +193,11 @@ def test_searchsorted(self, data_for_sorting): if not data_for_sorting.ordered: raise pytest.skip(reason="searchsorted requires ordered data.") + def test_argsort_nan_last(self, data_missing_for_sorting): + # GH 21801 + # TODO: Categorical.argsort places NA values at the end + pass + class TestCasting(base.BaseCastingTests): pass From 7811a081ae162ab4ecb94390a7d6ce55b6b6fcd5 Mon Sep 17 00:00:00 2001 From: makbigc Date: Sun, 9 Jun 2019 12:32:13 +0800 Subject: [PATCH 15/22] Add _values_for_argsort in core/arrays/base.py --- pandas/core/arrays/base.py | 23 +++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 5610300f6f690..6d0c907c22198 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -362,6 +362,21 @@ def isna(self) -> ArrayLike: """ raise AbstractMethodError(self) + def _values_for_argsort(self) -> np.ndarray: + """ + Return values for sorting. + Returns + ------- + ndarray + The transformed values should maintain the ordering between values + within the array. + See Also + -------- + ExtensionArray.argsort + """ + # Note: this is used in `ExtensionArray.argsort`. + return np.array(self) + def argsort(self, ascending=True, kind='quicksort', *args, **kwargs): """ Return the indices that would sort this array. @@ -394,10 +409,10 @@ def argsort(self, ascending=True, kind='quicksort', *args, **kwargs): # 2. argsort : total control over sorting. ascending = nv.validate_argsort_with_ascending(ascending, args, kwargs) - if hasattr(self, '_values_for_argsort'): - values = self._values_for_argsort() - else: - values = self + #if hasattr(self, '_values_for_argsort'): + values = self._values_for_argsort() + #else: + #values = self na_position = 'last' if ascending else 'first' result = nargsort(values, kind=kind, ascending=ascending, na_position=na_position) From c172ba1f586a5ff644ebcc965c11f1b3fd8d74ba Mon Sep 17 00:00:00 2001 From: makbigc Date: Sun, 9 Jun 2019 12:38:43 +0800 Subject: [PATCH 16/22] Fix PEP8 error --- pandas/core/arrays/base.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 6d0c907c22198..2063dfdfe1917 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -403,16 +403,10 @@ def argsort(self, ascending=True, kind='quicksort', *args, **kwargs): # Implementor note: You have two places to override the behavior of # argsort. # 1. _values_for_argsort : construct the values passed to np.argsort. - # The transformed values should maintain the ordering between values - # within the array. Preferably, the largest value is assiged to NA - # values in transformation. # 2. argsort : total control over sorting. ascending = nv.validate_argsort_with_ascending(ascending, args, kwargs) - #if hasattr(self, '_values_for_argsort'): values = self._values_for_argsort() - #else: - #values = self na_position = 'last' if ascending else 'first' result = nargsort(values, kind=kind, ascending=ascending, na_position=na_position) From d9660b11f2acbb57552c92e4feca0c48e2daba12 Mon Sep 17 00:00:00 2001 From: makbigc Date: Sun, 9 Jun 2019 12:59:34 +0800 Subject: [PATCH 17/22] Add one more test --- pandas/tests/test_sorting.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/pandas/tests/test_sorting.py b/pandas/tests/test_sorting.py index 75fa37eb9af09..fb23c8822a000 100644 --- a/pandas/tests/test_sorting.py +++ b/pandas/tests/test_sorting.py @@ -7,7 +7,8 @@ import pytest from pandas import ( - DataFrame, MultiIndex, Series, array, concat, merge, to_datetime) + Categorical, DataFrame, MultiIndex, Series, array, concat, merge, + to_datetime) from pandas.core import common as com from pandas.core.sorting import ( decons_group_index, get_group_index, is_int64_overflow_possible, @@ -188,6 +189,14 @@ def test_nargsort_datetimearray_warning(self): with tm.assert_produces_warning(None): nargsort(data) + @pytest.mark.parametrize('data', [ + Categorical(['a', 'c', 'a', 'b']), + to_datetime([0, 2, 0, 1]).tz_localize('Europe/Brussels')]) + def test_nargsort_extension_array(self, data): + result = nargsort(data) + expected = np.array([0, 2, 3, 1], dtype=np.intp) + tm.assert_numpy_array_equal(result, expected) + class TestMerge: From c333e8c2a133f440f014caefd48de702b712254b Mon Sep 17 00:00:00 2001 From: makbigc Date: Sun, 9 Jun 2019 22:17:51 +0800 Subject: [PATCH 18/22] Add _values_for_argsort back to numpy_.py for efficiency --- pandas/core/arrays/numpy_.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pandas/core/arrays/numpy_.py b/pandas/core/arrays/numpy_.py index 92cf7e9425cd3..6ae464f169a13 100644 --- a/pandas/core/arrays/numpy_.py +++ b/pandas/core/arrays/numpy_.py @@ -283,6 +283,9 @@ def copy(self, deep=False): def _values_for_factorize(self): return self._ndarray, -1 + def _values_for_argsort(self): + return self._ndarray + def unique(self): from pandas import unique From 315e6f2a49d8dc14ad572a96dcb82727eeba0eff Mon Sep 17 00:00:00 2001 From: makbigc Date: Mon, 10 Jun 2019 10:53:40 +0800 Subject: [PATCH 19/22] Set na_position last and revert _values_for_argsort in datetimelike.py --- pandas/core/arrays/base.py | 3 +-- pandas/core/arrays/datetimelike.py | 3 +++ pandas/core/arrays/period.py | 3 +++ pandas/core/sorting.py | 2 +- 4 files changed, 8 insertions(+), 3 deletions(-) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 2063dfdfe1917..38c4fe9b2c007 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -407,9 +407,8 @@ def argsort(self, ascending=True, kind='quicksort', *args, **kwargs): ascending = nv.validate_argsort_with_ascending(ascending, args, kwargs) values = self._values_for_argsort() - na_position = 'last' if ascending else 'first' result = nargsort(values, kind=kind, ascending=ascending, - na_position=na_position) + na_position='last') return result def fillna(self, value=None, method=None, limit=None): diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index e1a7eb625a5c7..c32f8642dc2ed 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -620,6 +620,9 @@ def _values_for_factorize(self): def _from_factorized(cls, values, original): return cls(values, dtype=original.dtype) + def _values_for_argsort(self): + return self._data + # ------------------------------------------------------------------ # Additional array methods # These are not part of the EA API, but we implement them because diff --git a/pandas/core/arrays/period.py b/pandas/core/arrays/period.py index 0d775bdec5a7b..dea8f3038ebd8 100644 --- a/pandas/core/arrays/period.py +++ b/pandas/core/arrays/period.py @@ -673,6 +673,9 @@ def _check_timedeltalike_freq_compat(self, other): _raise_on_incompatible(self, other) + def _values_for_argsort(self): + return self + PeriodArray._add_comparison_ops() diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py index 21c0c8f747b10..f4b71f69f481b 100644 --- a/pandas/core/sorting.py +++ b/pandas/core/sorting.py @@ -255,6 +255,7 @@ def nargsort(items, kind='quicksort', ascending=True, na_position='last'): sorted_idx = np.roll(sorted_idx, cnt_null) return sorted_idx + mask = isna(items) with warnings.catch_warnings(): # https://github.com/pandas-dev/pandas/issues/25439 # can be removed once ExtensionArrays are properly handled by nargsort @@ -263,7 +264,6 @@ def nargsort(items, kind='quicksort', ascending=True, na_position='last'): message="Converting timezone-aware DatetimeArray to") items = np.asanyarray(items) idx = np.arange(len(items)) - mask = isna(items) non_nans = items[~mask] non_nan_idx = idx[~mask] nan_idx = np.nonzero(mask)[0] From 0dc48a454e1445a4215cd5bcca483b6530140d77 Mon Sep 17 00:00:00 2001 From: makbigc Date: Fri, 14 Jun 2019 14:42:13 +0800 Subject: [PATCH 20/22] Add np.ndarray check to aviod double conversion --- pandas/core/arrays/base.py | 2 ++ pandas/core/arrays/integer.py | 13 +++++++++++++ pandas/core/arrays/numpy_.py | 6 +++--- pandas/core/sorting.py | 16 +++++++++------- 4 files changed, 27 insertions(+), 10 deletions(-) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 38c4fe9b2c007..088ce9e380174 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -365,11 +365,13 @@ def isna(self) -> ArrayLike: def _values_for_argsort(self) -> np.ndarray: """ Return values for sorting. + Returns ------- ndarray The transformed values should maintain the ordering between values within the array. + See Also -------- ExtensionArray.argsort diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py index 1dbc8cde595e5..d74975b6d29fd 100644 --- a/pandas/core/arrays/integer.py +++ b/pandas/core/arrays/integer.py @@ -501,6 +501,19 @@ def value_counts(self, dropna=True): return Series(array, index=index) + def _values_for_argsort(self) -> np.ndarray: + """Return values for sorting. + Returns + ------- + ndarray + The transformed values should maintain the ordering between values + within the array. + See Also + -------- + ExtensionArray.argsort + """ + return self + @classmethod def _create_comparison_method(cls, op): def cmp_method(self, other): diff --git a/pandas/core/arrays/numpy_.py b/pandas/core/arrays/numpy_.py index 6ae464f169a13..0deefd2b10b6e 100644 --- a/pandas/core/arrays/numpy_.py +++ b/pandas/core/arrays/numpy_.py @@ -280,12 +280,12 @@ def take(self, indices, allow_fill=False, fill_value=None): def copy(self, deep=False): return type(self)(self._ndarray.copy()) - def _values_for_factorize(self): - return self._ndarray, -1 - def _values_for_argsort(self): return self._ndarray + def _values_for_factorize(self): + return self._ndarray, -1 + def unique(self): from pandas import unique diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py index f4b71f69f481b..4478166626661 100644 --- a/pandas/core/sorting.py +++ b/pandas/core/sorting.py @@ -256,13 +256,15 @@ def nargsort(items, kind='quicksort', ascending=True, na_position='last'): return sorted_idx mask = isna(items) - with warnings.catch_warnings(): - # https://github.com/pandas-dev/pandas/issues/25439 - # can be removed once ExtensionArrays are properly handled by nargsort - warnings.filterwarnings( - "ignore", category=FutureWarning, - message="Converting timezone-aware DatetimeArray to") - items = np.asanyarray(items) + if not isinstance(items, np.ndarray): + with warnings.catch_warnings(): + # https://github.com/pandas-dev/pandas/issues/25439 + # can be removed once ExtensionArrays are properly handled by + # nargsort + warnings.filterwarnings( + "ignore", category=FutureWarning, + message="Converting timezone-aware DatetimeArray to") + items = np.asanyarray(items) idx = np.arange(len(items)) non_nans = items[~mask] non_nan_idx = idx[~mask] From 7186d9dc378a15dddc1d1ddbdfdb5a3139f23568 Mon Sep 17 00:00:00 2001 From: makbigc Date: Fri, 14 Jun 2019 15:55:58 +0800 Subject: [PATCH 21/22] change whatsnew --- doc/source/whatsnew/v0.25.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 273cab7003746..18dcaf1359fe3 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -459,7 +459,7 @@ Other API Changes - Most Pandas classes had a ``__bytes__`` method, which was used for getting a python2-style bytestring representation of the object. This method has been removed as a part of dropping Python2 (:issue:`26447`) - The ``.str``-accessor has been disabled for 1-level :class:`MultiIndex`, use :meth:`MultiIndex.to_flat_index` if necessary (:issue:`23679`) - Removed support of gtk package for clipboards (:issue:`26563`) -- :meth:`ExtensionArray.argsort` places NA values at the end of the sorted array (:issue:`21801`) +- :meth:`ExtensionArray.argsort` places NA values at the end of the sorted array, except :class:`Categorical` (:issue:`21801`) .. _whatsnew_0250.deprecations: From 84f87a84b1295b6221302d465942e5b265c62f4a Mon Sep 17 00:00:00 2001 From: makbigc Date: Fri, 14 Jun 2019 16:09:48 +0800 Subject: [PATCH 22/22] Undo the addition --- pandas/core/arrays/integer.py | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py index d74975b6d29fd..1dbc8cde595e5 100644 --- a/pandas/core/arrays/integer.py +++ b/pandas/core/arrays/integer.py @@ -501,19 +501,6 @@ def value_counts(self, dropna=True): return Series(array, index=index) - def _values_for_argsort(self) -> np.ndarray: - """Return values for sorting. - Returns - ------- - ndarray - The transformed values should maintain the ordering between values - within the array. - See Also - -------- - ExtensionArray.argsort - """ - return self - @classmethod def _create_comparison_method(cls, op): def cmp_method(self, other):