Skip to content

API: ExtensionArray.argsort places the missing value at the end #27137

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 12 commits into from
Jul 3, 2019
12 changes: 12 additions & 0 deletions asv_bench/benchmarks/algorithms.py
Original file line number Diff line number Diff line change
Expand Up @@ -141,4 +141,16 @@ def time_quantile(self, quantile, interpolation, dtype):
self.idx.quantile(quantile, interpolation=interpolation)


class SortIntegerArray:
params = [10**3, 10**5]

def setup(self, N):
data = np.arange(N, dtype=float)
data[40] = np.nan
self.array = pd.array(data, dtype='Int64')

def time_argsort(self, N):
self.array.argsort()


from .pandas_vb_common import setup # noqa: F401 isort:skip
33 changes: 32 additions & 1 deletion doc/source/whatsnew/v0.25.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -661,7 +661,7 @@ when both are :class:`Series` (:issue:`23293`).

*Previous behavior*

.. code-block:: python
.. code-block:: ipython

In [5]: np.power(s1, s2)
Out[5]:
Expand All @@ -684,6 +684,36 @@ applying the ufunc.

np.power(s1, s2.array)

Categorical.argsort now places missing values at the end
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

:meth:`Categorical.argsort` now places missing values at the end of the array, making it
consistent with NumPy and the rest of pandas (:issue:`21801`).

.. ipython:: python

cat = pd.Categorical(['b', None, 'a'], categories=['a', 'b'], ordered=True)

*Previous behavior*

.. code-block:: ipython

In [2]: cat = pd.Categorical(['b', None, 'a'], categories=['a', 'b'], ordered=True)

In [3]: cat.argsort()
Out[3]: array([1, 2, 0])

In [4]: cat[cat.argsort()]
Out[4]:
[NaN, a, b]
categories (2, object): [a < b]

*New behavior*

.. ipython:: python

cat.argsort()
cat[cat.argsort()]

.. _whatsnew_0250.api_breaking.deps:

Expand Down Expand Up @@ -767,6 +797,7 @@ Other API changes
- Removed support of gtk package for clipboards (:issue:`26563`)
- Using an unsupported version of Beautiful Soup 4 will now raise an ``ImportError`` instead of a ``ValueError`` (:issue:`27063`)
- :meth:`Series.to_excel` and :meth:`DataFrame.to_excel` will now raise a ``ValueError`` when saving timezone aware data. (:issue:`27008`, :issue:`7056`)
- :meth:`ExtensionArray.argsort` places NA values at the end of the sorted array. (:issue:`21801`)
- :meth:`DataFrame.to_hdf` and :meth:`Series.to_hdf` will now raise a ``NotImplementedError`` when saving a :class:`MultiIndex` with extention data types for a ``fixed`` format. (:issue:`7775`)
- Passing duplicate ``names`` in :meth:`read_csv` will now raise a ``ValueError`` (:issue:`17346`)

Expand Down
11 changes: 6 additions & 5 deletions pandas/core/arrays/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@

from pandas._typing import ArrayLike
from pandas.core import ops
from pandas.core.sorting import nargsort

_not_implemented_message = "{} does not implement {}."

Expand Down Expand Up @@ -409,7 +410,8 @@ def argsort(self, ascending=True, kind='quicksort', *args, **kwargs):
Returns
-------
index_array : ndarray
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

so we aren't specific that these are actually intp (and NOT int64's)

Array of indices that sort ``self``.
Array of indices that sort ``self``. If NaN values are contained,
NaN values are placed at the end.

See Also
--------
Expand All @@ -420,10 +422,9 @@ def argsort(self, ascending=True, kind='quicksort', *args, **kwargs):
# 1. _values_for_argsort : construct the values passed to np.argsort
# 2. argsort : total control over sorting.
ascending = nv.validate_argsort_with_ascending(ascending, args, kwargs)
values = self._values_for_argsort()
result = np.argsort(values, kind=kind, **kwargs)
if not ascending:
result = result[::-1]

result = nargsort(self, kind=kind, ascending=ascending,
na_position='last')
return result

def fillna(self, value=None, method=None, limit=None):
Expand Down
18 changes: 12 additions & 6 deletions pandas/core/arrays/categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -1531,13 +1531,14 @@ def check_for_ordered(self, op):
def _values_for_argsort(self):
return self._codes.copy()

def argsort(self, *args, **kwargs):
# TODO(PY2): use correct signature
# We have to do *args, **kwargs to avoid a a py2-only signature
# issue since np.argsort differs from argsort.
def argsort(self, ascending=True, kind='quicksort', *args, **kwargs):
"""
Return the indices that would sort the Categorical.

.. versionchanged:: 0.25.0

Changed to sort missing values at the end.

Parameters
----------
ascending : bool, default True
Expand Down Expand Up @@ -1574,9 +1575,14 @@ def argsort(self, *args, **kwargs):
... ordered=True)
>>> cat.argsort()
array([3, 0, 1, 2])

Missing values are placed at the end

>>> cat = pd.Categorical([2, None, 1])
>>> cat.argsort()
array([2, 0, 1])
"""
# Keep the implementation here just for the docstring.
return super().argsort(*args, **kwargs)
return super().argsort(ascending=ascending, kind=kind, *args, **kwargs)

def sort_values(self, inplace=False, ascending=True, na_position='last'):
"""
Expand Down
14 changes: 0 additions & 14 deletions pandas/core/sorting.py
Original file line number Diff line number Diff line change
Expand Up @@ -240,20 +240,6 @@ def nargsort(items, kind='quicksort', ascending=True, na_position='last'):

items = extract_array(items)
mask = np.asarray(isna(items))
# specially handle Categorical
if is_categorical_dtype(items):
if na_position not in {'first', 'last'}:
raise ValueError('invalid na_position: {!r}'.format(na_position))

cnt_null = mask.sum()
sorted_idx = items.argsort(ascending=ascending, kind=kind)
if ascending and na_position == 'last':
# NaN is coded as -1 and is listed in front after sorting
sorted_idx = np.roll(sorted_idx, -cnt_null)
elif not ascending and na_position == 'first':
# NaN is coded as -1 and is listed in the end after sorting
sorted_idx = np.roll(sorted_idx, cnt_null)
return sorted_idx

if is_extension_array_dtype(items):
items = items._values_for_argsort()
Expand Down
8 changes: 8 additions & 0 deletions pandas/tests/extension/base/methods.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,14 @@ def test_argsort(self, data_for_sorting):
expected = pd.Series(np.array([2, 0, 1], dtype=np.int64))
self.assert_series_equal(result, expected)

def test_argsort_missing_array(self, data_missing_for_sorting):
result = data_missing_for_sorting.argsort()
expected = np.array([2, 0, 1], dtype=np.dtype("int"))
# we don't care whether it's int32 or int64
result = result.astype("int64", casting="safe")
expected = expected.astype("int64", casting="safe")
tm.assert_numpy_array_equal(result, expected)

def test_argsort_missing(self, data_missing_for_sorting):
result = pd.Series(data_missing_for_sorting).argsort()
expected = pd.Series(np.array([1, -1, 0], dtype=np.int64))
Expand Down
4 changes: 2 additions & 2 deletions pandas/tests/frame/test_sorting.py
Original file line number Diff line number Diff line change
Expand Up @@ -539,7 +539,7 @@ def test_sort_index_categorical_index(self):
assert_frame_equal(result, expected)

result = df.sort_index(ascending=False)
expected = df.iloc[[3, 2, 5, 1, 0, 4]]
expected = df.iloc[[2, 3, 0, 1, 5, 4]]
assert_frame_equal(result, expected)

def test_sort_index(self):
Expand Down Expand Up @@ -629,7 +629,7 @@ def test_sort_index_na_position_with_categories(self):

reversed_categories = sorted(categories, reverse=True)
reversed_category_indices = sorted(category_indices, reverse=True)
reversed_na_indices = sorted(na_indices, reverse=True)
reversed_na_indices = sorted(na_indices)

df = pd.DataFrame({
column_name: pd.Categorical(['A', np.nan, 'B', np.nan, 'C'],
Expand Down