pandas-dev · jreback · Jul 3, 2019 · Jun 29, 2019 · Jun 30, 2019 · Jul 3, 2019
diff --git a/asv_bench/benchmarks/algorithms.py b/asv_bench/benchmarks/algorithms.py
@@ -141,4 +141,16 @@ def time_quantile(self, quantile, interpolation, dtype):
         self.idx.quantile(quantile, interpolation=interpolation)
 
 
+class SortIntegerArray:
+    params = [10**3, 10**5]
+
+    def setup(self, N):
+        data = np.arange(N, dtype=float)
+        data[40] = np.nan
+        self.array = pd.array(data, dtype='Int64')
+
+    def time_argsort(self, N):
+        self.array.argsort()
+
+
 from .pandas_vb_common import setup  # noqa: F401 isort:skip
diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst
@@ -661,7 +661,7 @@ when both are :class:`Series` (:issue:`23293`).
 
 *Previous behavior*
 
-.. code-block:: python
+.. code-block:: ipython
 
    In [5]: np.power(s1, s2)
    Out[5]:
@@ -684,6 +684,36 @@ applying the ufunc.
 
    np.power(s1, s2.array)
 
+Categorical.argsort now places missing values at the end
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+:meth:`Categorical.argsort` now places missing values at the end of the array, making it
+consistent with NumPy and the rest of pandas (:issue:`21801`).
+
+.. ipython:: python
+
+   cat = pd.Categorical(['b', None, 'a'], categories=['a', 'b'], ordered=True)
+
+*Previous behavior*
+
+.. code-block:: ipython
+
+   In [2]: cat = pd.Categorical(['b', None, 'a'], categories=['a', 'b'], ordered=True)
+
+   In [3]: cat.argsort()
+   Out[3]: array([1, 2, 0])
+
+   In [4]: cat[cat.argsort()]
+   Out[4]:
+   [NaN, a, b]
+   categories (2, object): [a < b]
+
+*New behavior*
+
+.. ipython:: python
+
+   cat.argsort()
+   cat[cat.argsort()]
 
 .. _whatsnew_0250.api_breaking.deps:
 
@@ -767,6 +797,7 @@ Other API changes
 - Removed support of gtk package for clipboards (:issue:`26563`)
 - Using an unsupported version of Beautiful Soup 4 will now raise an ``ImportError`` instead of a ``ValueError`` (:issue:`27063`)
 - :meth:`Series.to_excel` and :meth:`DataFrame.to_excel` will now raise a ``ValueError`` when saving timezone aware data. (:issue:`27008`, :issue:`7056`)
+- :meth:`ExtensionArray.argsort` places NA values at the end of the sorted array. (:issue:`21801`)
 - :meth:`DataFrame.to_hdf` and :meth:`Series.to_hdf` will now raise a ``NotImplementedError`` when saving a :class:`MultiIndex` with extention data types for a ``fixed`` format. (:issue:`7775`)
 - Passing duplicate ``names`` in :meth:`read_csv` will now raise a ``ValueError`` (:issue:`17346`)
 

diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py
@@ -23,6 +23,7 @@
 
 from pandas._typing import ArrayLike
 from pandas.core import ops
+from pandas.core.sorting import nargsort
 
 _not_implemented_message = "{} does not implement {}."
 
@@ -409,7 +410,8 @@ def argsort(self, ascending=True, kind='quicksort', *args, **kwargs):
         Returns
         -------
         index_array : ndarray
-            Array of indices that sort ``self``.
+            Array of indices that sort ``self``. If NaN values are contained,
+            NaN values are placed at the end.
 
         See Also
         --------
@@ -420,10 +422,9 @@ def argsort(self, ascending=True, kind='quicksort', *args, **kwargs):
         # 1. _values_for_argsort : construct the values passed to np.argsort
         # 2. argsort : total control over sorting.
         ascending = nv.validate_argsort_with_ascending(ascending, args, kwargs)
-        values = self._values_for_argsort()
-        result = np.argsort(values, kind=kind, **kwargs)
-        if not ascending:
-            result = result[::-1]
+
+        result = nargsort(self, kind=kind, ascending=ascending,
+                          na_position='last')
         return result
 
     def fillna(self, value=None, method=None, limit=None):

diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py
@@ -1531,13 +1531,14 @@ def check_for_ordered(self, op):
     def _values_for_argsort(self):
         return self._codes.copy()
 
-    def argsort(self, *args, **kwargs):
-        # TODO(PY2): use correct signature
-        # We have to do *args, **kwargs to avoid a a py2-only signature
-        # issue since np.argsort differs from argsort.
+    def argsort(self, ascending=True, kind='quicksort', *args, **kwargs):
         """
         Return the indices that would sort the Categorical.
 
+        .. versionchanged:: 0.25.0
+
+           Changed to sort missing values at the end.
+
         Parameters
         ----------
         ascending : bool, default True
@@ -1574,9 +1575,14 @@ def argsort(self, *args, **kwargs):
         ...                      ordered=True)
         >>> cat.argsort()
         array([3, 0, 1, 2])
+
+        Missing values are placed at the end
+
+        >>> cat = pd.Categorical([2, None, 1])
+        >>> cat.argsort()
+        array([2, 0, 1])
         """
-        # Keep the implementation here just for the docstring.
-        return super().argsort(*args, **kwargs)
+        return super().argsort(ascending=ascending, kind=kind, *args, **kwargs)
 
     def sort_values(self, inplace=False, ascending=True, na_position='last'):
         """

diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py
@@ -240,20 +240,6 @@ def nargsort(items, kind='quicksort', ascending=True, na_position='last'):
 
     items = extract_array(items)
     mask = np.asarray(isna(items))
-    # specially handle Categorical
-    if is_categorical_dtype(items):
-        if na_position not in {'first', 'last'}:
-            raise ValueError('invalid na_position: {!r}'.format(na_position))
-
-        cnt_null = mask.sum()
-        sorted_idx = items.argsort(ascending=ascending, kind=kind)
-        if ascending and na_position == 'last':
-            # NaN is coded as -1 and is listed in front after sorting
-            sorted_idx = np.roll(sorted_idx, -cnt_null)
-        elif not ascending and na_position == 'first':
-            # NaN is coded as -1 and is listed in the end after sorting
-            sorted_idx = np.roll(sorted_idx, cnt_null)
-        return sorted_idx
 
     if is_extension_array_dtype(items):
         items = items._values_for_argsort()

diff --git a/pandas/tests/extension/base/methods.py b/pandas/tests/extension/base/methods.py
@@ -47,6 +47,14 @@ def test_argsort(self, data_for_sorting):
         expected = pd.Series(np.array([2, 0, 1], dtype=np.int64))
         self.assert_series_equal(result, expected)
 
+    def test_argsort_missing_array(self, data_missing_for_sorting):
+        result = data_missing_for_sorting.argsort()
+        expected = np.array([2, 0, 1], dtype=np.dtype("int"))
+        # we don't care whether it's int32 or int64
+        result = result.astype("int64", casting="safe")
+        expected = expected.astype("int64", casting="safe")
+        tm.assert_numpy_array_equal(result, expected)
+
     def test_argsort_missing(self, data_missing_for_sorting):
         result = pd.Series(data_missing_for_sorting).argsort()
         expected = pd.Series(np.array([1, -1, 0], dtype=np.int64))

diff --git a/pandas/tests/frame/test_sorting.py b/pandas/tests/frame/test_sorting.py
@@ -539,7 +539,7 @@ def test_sort_index_categorical_index(self):
         assert_frame_equal(result, expected)
 
         result = df.sort_index(ascending=False)
-        expected = df.iloc[[3, 2, 5, 1, 0, 4]]
+        expected = df.iloc[[2, 3, 0, 1, 5, 4]]
         assert_frame_equal(result, expected)
 
     def test_sort_index(self):
@@ -629,7 +629,7 @@ def test_sort_index_na_position_with_categories(self):
 
         reversed_categories = sorted(categories, reverse=True)
         reversed_category_indices = sorted(category_indices, reverse=True)
-        reversed_na_indices = sorted(na_indices, reverse=True)
+        reversed_na_indices = sorted(na_indices)
 
         df = pd.DataFrame({
             column_name: pd.Categorical(['A', np.nan, 'B', np.nan, 'C'],