BUG: df.sort_values() not respecting na_position with categoricals #22556 (#22640)

staftermath · jreback · commit 32ef84b50542 · 2018-10-18T12:07:25.000-04:00
diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt
@@ -791,6 +791,7 @@ Categorical
 ^^^^^^^^^^^
 
 - Bug in :meth:`Categorical.from_codes` where ``NaN`` values in ``codes`` were silently converted to ``0`` (:issue:`21767`). In the future this will raise a ``ValueError``. Also changes the behavior of ``.from_codes([1.1, 2.0])``.
+- Bug in :meth:`Categorical.sort_values` where ``NaN`` values were always positioned in front regardless of ``na_position`` value. (:issue:`22556`).
 - Bug when indexing with a boolean-valued ``Categorical``. Now a boolean-valued ``Categorical`` is treated as a boolean mask (:issue:`22665`)
 - Constructing a :class:`CategoricalIndex` with empty values and boolean categories was raising a ``ValueError`` after a change to dtype coercion (:issue:`22702`).
 
diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py
@@ -45,6 +45,8 @@
 
 import pandas.core.algorithms as algorithms
 
+from pandas.core.sorting import nargsort
+
 from pandas.io.formats import console
 from pandas.io.formats.terminal import get_terminal_size
 from pandas.util._validators import validate_bool_kwarg, validate_fillna_kwargs
@@ -1605,32 +1607,15 @@ def sort_values(self, inplace=False, ascending=True, na_position='last'):
             msg = 'invalid na_position: {na_position!r}'
             raise ValueError(msg.format(na_position=na_position))
 
-        codes = np.sort(self._codes)
-        if not ascending:
-            codes = codes[::-1]
-
-        # NaN handling
-        na_mask = (codes == -1)
-        if na_mask.any():
-            n_nans = len(codes[na_mask])
-            if na_position == "first":
-                # in this case sort to the front
-                new_codes = codes.copy()
-                new_codes[0:n_nans] = -1
-                new_codes[n_nans:] = codes[~na_mask]
-                codes = new_codes
-            elif na_position == "last":
-                # ... and to the end
-                new_codes = codes.copy()
-                pos = len(codes) - n_nans
-                new_codes[0:pos] = codes[~na_mask]
-                new_codes[pos:] = -1
-                codes = new_codes
+        sorted_idx = nargsort(self,
+                              ascending=ascending,
+                              na_position=na_position)
+
         if inplace:
-            self._codes = codes
-            return
+            self._codes = self._codes[sorted_idx]
         else:
-            return self._constructor(values=codes, dtype=self.dtype,
+            return self._constructor(values=self._codes[sorted_idx],
+                                     dtype=self.dtype,
                                      fastpath=True)
 
     def _values_for_rank(self):
diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py
@@ -241,7 +241,19 @@ def nargsort(items, kind='quicksort', ascending=True, na_position='last'):
 
     # specially handle Categorical
     if is_categorical_dtype(items):
-        return items.argsort(ascending=ascending, kind=kind)
+        if na_position not in {'first', 'last'}:
+            raise ValueError('invalid na_position: {!r}'.format(na_position))
+
+        mask = isna(items)
+        cnt_null = mask.sum()
+        sorted_idx = items.argsort(ascending=ascending, kind=kind)
+        if ascending and na_position == 'last':
+            # NaN is coded as -1 and is listed in front after sorting
+            sorted_idx = np.roll(sorted_idx, -cnt_null)
+        elif not ascending and na_position == 'first':
+            # NaN is coded as -1 and is listed in the end after sorting
+            sorted_idx = np.roll(sorted_idx, cnt_null)
+        return sorted_idx
 
     items = np.asanyarray(items)
     idx = np.arange(len(items))
diff --git a/pandas/tests/frame/test_sorting.py b/pandas/tests/frame/test_sorting.py
@@ -10,7 +10,7 @@
 from pandas.compat import lrange
 from pandas.api.types import CategoricalDtype
 from pandas import (DataFrame, Series, MultiIndex, Timestamp,
-                    date_range, NaT, IntervalIndex)
+                    date_range, NaT, IntervalIndex, Categorical)
 
 from pandas.util.testing import assert_series_equal, assert_frame_equal
 
@@ -161,7 +161,7 @@ def test_sort_nan(self):
              'B': [5, 9, 2, nan, 5, 5, 4]},
             index=[2, 0, 3, 1, 6, 4, 5])
         sorted_df = df.sort_values(['A', 'B'], ascending=[
-                                   1, 0], na_position='first')
+            1, 0], na_position='first')
         assert_frame_equal(sorted_df, expected)
 
         # na_position='last', not order
@@ -170,7 +170,7 @@ def test_sort_nan(self):
              'B': [4, 5, 5, nan, 2, 9, 5]},
             index=[5, 4, 6, 1, 3, 0, 2])
         sorted_df = df.sort_values(['A', 'B'], ascending=[
-                                   0, 1], na_position='last')
+            0, 1], na_position='last')
         assert_frame_equal(sorted_df, expected)
 
         # Test DataFrame with nan label
@@ -514,7 +514,7 @@ def test_sort_index_categorical_index(self):
 
         df = (DataFrame({'A': np.arange(6, dtype='int64'),
                          'B': Series(list('aabbca'))
-                         .astype(CategoricalDtype(list('cab')))})
+                        .astype(CategoricalDtype(list('cab')))})
               .set_index('B'))
 
         result = df.sort_index()
@@ -598,3 +598,81 @@ def test_sort_index_intervalindex(self):
             closed='right')
         result = result.columns.levels[1].categories
         tm.assert_index_equal(result, expected)
+
+    def test_sort_index_na_position_with_categories(self):
+        # GH 22556
+        # Positioning missing value properly when column is Categorical.
+        categories = ['A', 'B', 'C']
+        category_indices = [0, 2, 4]
+        list_of_nans = [np.nan, np.nan]
+        na_indices = [1, 3]
+        na_position_first = 'first'
+        na_position_last = 'last'
+        column_name = 'c'
+
+        reversed_categories = sorted(categories, reverse=True)
+        reversed_category_indices = sorted(category_indices, reverse=True)
+        reversed_na_indices = sorted(na_indices, reverse=True)
+
+        df = pd.DataFrame({
+            column_name: pd.Categorical(['A', np.nan, 'B', np.nan, 'C'],
+                                        categories=categories,
+                                        ordered=True)})
+        # sort ascending with na first
+        result = df.sort_values(by=column_name,
+                                ascending=True,
+                                na_position=na_position_first)
+        expected = DataFrame({
+            column_name: Categorical(list_of_nans + categories,
+                                     categories=categories,
+                                     ordered=True)
+        }, index=na_indices + category_indices)
+
+        assert_frame_equal(result, expected)
+
+        # sort ascending with na last
+        result = df.sort_values(by=column_name,
+                                ascending=True,
+                                na_position=na_position_last)
+        expected = DataFrame({
+            column_name: Categorical(categories + list_of_nans,
+                                     categories=categories,
+                                     ordered=True)
+        }, index=category_indices + na_indices)
+
+        assert_frame_equal(result, expected)
+
+        # sort descending with na first
+        result = df.sort_values(by=column_name,
+                                ascending=False,
+                                na_position=na_position_first)
+        expected = DataFrame({
+            column_name: Categorical(list_of_nans + reversed_categories,
+                                     categories=categories,
+                                     ordered=True)
+        }, index=reversed_na_indices + reversed_category_indices)
+
+        assert_frame_equal(result, expected)
+
+        # sort descending with na last
+        result = df.sort_values(by=column_name,
+                                ascending=False,
+                                na_position=na_position_last)
+        expected = DataFrame({
+            column_name: Categorical(reversed_categories + list_of_nans,
+                                     categories=categories,
+                                     ordered=True)
+        }, index=reversed_category_indices + reversed_na_indices)
+
+        assert_frame_equal(result, expected)
+
+    def test_sort_index_na_position_with_categories_raises(self):
+        df = pd.DataFrame({
+            'c': pd.Categorical(['A', np.nan, 'B', np.nan, 'C'],
+                                categories=['A', 'B', 'C'],
+                                ordered=True)})
+
+        with pytest.raises(ValueError):
+            df.sort_values(by='c',
+                           ascending=False,
+                           na_position='bad_position')
diff --git a/pandas/tests/io/test_stata.py b/pandas/tests/io/test_stata.py
@@ -995,7 +995,7 @@ def test_categorical_sorting(self, file):
         parsed = read_stata(getattr(self, file))
 
         # Sort based on codes, not strings
-        parsed = parsed.sort_values("srh")
+        parsed = parsed.sort_values("srh", na_position='first')
 
         # Don't sort index
         parsed.index = np.arange(parsed.shape[0])