pandas-dev · jeetjitsu · Feb 16, 2017 · Feb 16, 2017 · Feb 16, 2017 · Feb 17, 2017
diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt
@@ -578,6 +578,7 @@ Bug Fixes
 
 
 
+- Bug in ``.rank()`` which incorrectly ranked ordered categoricals (:issue:`15420`)
 
 
 

diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
@@ -973,6 +973,10 @@ def _hashtable_algo(f, values, return_dtype=None):
 def _get_data_algo(values, func_map):
 
     f = None
+
+    if is_categorical_dtype(values):
+        values = values._values_for_rank()
+
     if is_float_dtype(values):
         f = func_map['float64']
         values = _ensure_float64(values)
@@ -988,7 +992,6 @@ def _get_data_algo(values, func_map):
     elif is_unsigned_integer_dtype(values):
         f = func_map['uint64']
         values = _ensure_uint64(values)
-
     else:
         values = _ensure_object(values)
 

diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py
@@ -1404,6 +1404,28 @@ def sort_values(self, inplace=False, ascending=True, na_position='last'):
             return self._constructor(values=codes, categories=self.categories,
                                      ordered=self.ordered, fastpath=True)
 
+    def _values_for_rank(self):
+        """
+        For correctly ranking ordered categorical data. See GH#15420
+
+        Ordered categoricals are ranked by their codes (-1 becomes NaN);
+        unordered categoricals are ranked by their underlying values.
+
+        Returns
+        -------
+        numpy array
+
+        """
+        if self.ordered:
+            values = self.codes
+            mask = values == -1
+            if mask.any():
+                values = values.astype('float64')
+                values[mask] = np.nan
+        else:
+            values = np.array(self)
+        return values
+
     def order(self, inplace=False, ascending=True, na_position='last'):
         """
         DEPRECATED: use :meth:`Categorical.sort_values`. That function

diff --git a/pandas/tests/series/test_analytics.py b/pandas/tests/series/test_analytics.py
@@ -1057,6 +1057,84 @@ def test_rank(self):
         iranks = iseries.rank()
         assert_series_equal(iranks, exp)
 
+    def test_rank_categorical(self):
+        # GH issue #15420 rank incorrectly orders ordered categories
+
+        # Test ascending/descending ranking for ordered categoricals
+        exp = pd.Series([1., 2., 3., 4., 5., 6.])
+        exp_desc = pd.Series([6., 5., 4., 3., 2., 1.])
+        ordered = pd.Series(
+            ['first', 'second', 'third', 'fourth', 'fifth', 'sixth']
+        ).astype('category', ).cat.set_categories(
+            ['first', 'second', 'third', 'fourth', 'fifth', 'sixth'],
+            ordered=True
+        )
+        assert_series_equal(ordered.rank(), exp)
+        assert_series_equal(ordered.rank(ascending=False), exp_desc)
+
+        # Unordered categoricals should be ranked as objects
+        unordered = pd.Series(
+            ['first', 'second', 'third', 'fourth', 'fifth', 'sixth'],
+        ).astype('category').cat.set_categories(
+            ['first', 'second', 'third', 'fourth', 'fifth', 'sixth'],
+            ordered=False
+        )
+        exp_unordered = pd.Series([2., 4., 6., 3., 1., 5.])
+        res = unordered.rank()
+        assert_series_equal(res, exp_unordered)
+
+        # Test na_option for rank data
+        na_ser = pd.Series(
+            ['first', 'second', 'third', 'fourth', 'fifth', 'sixth', np.NaN]
+        ).astype('category', ).cat.set_categories(
+            [
+                'first', 'second', 'third', 'fourth',
+                'fifth', 'sixth', 'seventh'
+            ],
+            ordered=True
+        )
+
+        exp_top = pd.Series([2., 3., 4., 5., 6., 7., 1.])
+        exp_bot = pd.Series([1., 2., 3., 4., 5., 6., 7.])
+        exp_keep = pd.Series([1., 2., 3., 4., 5., 6., np.NaN])
+
+        assert_series_equal(na_ser.rank(na_option='top'), exp_top)
+        assert_series_equal(na_ser.rank(na_option='bottom'), exp_bot)
+        assert_series_equal(na_ser.rank(na_option='keep'), exp_keep)
+
+        # Test na_option for rank data with ascending False
+        exp_top = pd.Series([7., 6., 5., 4., 3., 2., 1.])
+        exp_bot = pd.Series([6., 5., 4., 3., 2., 1., 7.])
+        exp_keep = pd.Series([6., 5., 4., 3., 2., 1., np.NaN])
+
+        assert_series_equal(
+            na_ser.rank(na_option='top', ascending=False),
+            exp_top
+        )
+        assert_series_equal(
+            na_ser.rank(na_option='bottom', ascending=False),
+            exp_bot
+        )
+        assert_series_equal(
+            na_ser.rank(na_option='keep', ascending=False),
+            exp_keep
+        )
+
+        # Test with pct=True
+        na_ser = pd.Series(
+            ['first', 'second', 'third', 'fourth', np.NaN],
+        ).astype('category').cat.set_categories(
+            ['first', 'second', 'third', 'fourth'],
+            ordered=True
+        )
+        exp_top = pd.Series([0.4, 0.6, 0.8, 1., 0.2])
+        exp_bot = pd.Series([0.2, 0.4, 0.6, 0.8, 1.])
+        exp_keep = pd.Series([0.25, 0.5, 0.75, 1., np.NaN])
+
+        assert_series_equal(na_ser.rank(na_option='top', pct=True), exp_top)
+        assert_series_equal(na_ser.rank(na_option='bottom', pct=True), exp_bot)
+        assert_series_equal(na_ser.rank(na_option='keep', pct=True), exp_keep)
+
     def test_rank_signature(self):
         s = Series([0, 1])
         s.rank(method='average')
Original file line number	Diff line number	Diff line change
Expand Up		@@ -578,6 +578,7 @@ Bug Fixes



		- Bug in ``.rank()`` which incorrectly ranks ordered categories (:issue:`15420`)



Expand Down