BUG: incorrect ranking in an ordered categorical

jeet63 · jreback · commit 3fe85afef47e · 2017-02-24T14:56:12.000-05:00
Check for categorical, and then pass the underlying integer codes.

closes #15420

Author: Prasanjit Prakash <jeet@gmail.com>

Closes #15422 from ikilledthecat/rank_categorical and squashes the following commits:

a7e573b [Prasanjit Prakash] moved test for categorical, in rank, to top
3ba4e3a [Prasanjit Prakash] corrections after rebasing
c43a029 [Prasanjit Prakash] using if/else construct to pick sorting function for categoricals
f8ec019 [Prasanjit Prakash] ask Categorical for ranking function
40d88c1 [Prasanjit Prakash] return values for rank from categorical object
049c0fc [Prasanjit Prakash] GH#15420 added support for na_option when ranking categorical
5e5bbeb [Prasanjit Prakash] BUG: GH#15420 rank for categoricals
ef999c3 [Prasanjit Prakash] merged with upstream master
fbaba1b [Prasanjit Prakash] return values for rank from categorical object
fa0b4c2 [Prasanjit Prakash] BUG: GH15420 - _rank private method on Categorical
9a6b5cd [Prasanjit Prakash] BUG: GH15420 - _rank private method on Categorical
4220e56 [Prasanjit Prakash] BUG: GH15420 - _rank private method on Categorical
6b70921 [Prasanjit Prakash] GH#15420 move rank inside categoricals
bf4e36c [Prasanjit Prakash] GH#15420 added support for na_option when ranking categorical
ce90207 [Prasanjit Prakash] BUG: GH#15420 rank for categoricals
85b267a [Prasanjit Prakash] Added support for categorical datatype in rank - issue#15420
diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt
@@ -578,6 +578,7 @@ Bug Fixes
 
 
 
+- Bug in ``.rank()`` which incorrectly ranks ordered categories (:issue:`15420`)
 
 
 
diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
@@ -973,6 +973,10 @@ def _hashtable_algo(f, values, return_dtype=None):
 def _get_data_algo(values, func_map):
 
     f = None
+
+    if is_categorical_dtype(values):
+        values = values._values_for_rank()
+
     if is_float_dtype(values):
         f = func_map['float64']
         values = _ensure_float64(values)
@@ -988,7 +992,6 @@ def _get_data_algo(values, func_map):
     elif is_unsigned_integer_dtype(values):
         f = func_map['uint64']
         values = _ensure_uint64(values)
-
     else:
         values = _ensure_object(values)
 
diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py
@@ -1404,6 +1404,28 @@ def sort_values(self, inplace=False, ascending=True, na_position='last'):
             return self._constructor(values=codes, categories=self.categories,
                                      ordered=self.ordered, fastpath=True)
 
+    def _values_for_rank(self):
+        """
+        For correctly ranking ordered categorical data. See GH#15420
+
+        Ordered categorical data should be ranked on the basis of
+        codes with -1 translated to NaN.
+
+        Returns
+        -------
+        numpy array
+
+        """
+        if self.ordered:
+            values = self.codes
+            mask = values == -1
+            if mask.any():
+                values = values.astype('float64')
+                values[mask] = np.nan
+        else:
+            values = np.array(self)
+        return values
+
     def order(self, inplace=False, ascending=True, na_position='last'):
         """
         DEPRECATED: use :meth:`Categorical.sort_values`. That function
diff --git a/pandas/tests/series/test_analytics.py b/pandas/tests/series/test_analytics.py
@@ -1057,6 +1057,84 @@ def test_rank(self):
         iranks = iseries.rank()
         assert_series_equal(iranks, exp)
 
+    def test_rank_categorical(self):
+        # GH issue #15420 rank incorrectly orders ordered categories
+
+        # Test ascending/descending ranking for ordered categoricals
+        exp = pd.Series([1., 2., 3., 4., 5., 6.])
+        exp_desc = pd.Series([6., 5., 4., 3., 2., 1.])
+        ordered = pd.Series(
+            ['first', 'second', 'third', 'fourth', 'fifth', 'sixth']
+        ).astype('category', ).cat.set_categories(
+            ['first', 'second', 'third', 'fourth', 'fifth', 'sixth'],
+            ordered=True
+        )
+        assert_series_equal(ordered.rank(), exp)
+        assert_series_equal(ordered.rank(ascending=False), exp_desc)
+
+        # Unordered categoricals should be ranked as objects
+        unordered = pd.Series(
+            ['first', 'second', 'third', 'fourth', 'fifth', 'sixth'],
+        ).astype('category').cat.set_categories(
+            ['first', 'second', 'third', 'fourth', 'fifth', 'sixth'],
+            ordered=False
+        )
+        exp_unordered = pd.Series([2., 4., 6., 3., 1., 5.])
+        res = unordered.rank()
+        assert_series_equal(res, exp_unordered)
+
+        # Test na_option for rank data
+        na_ser = pd.Series(
+            ['first', 'second', 'third', 'fourth', 'fifth', 'sixth', np.NaN]
+        ).astype('category', ).cat.set_categories(
+            [
+                'first', 'second', 'third', 'fourth',
+                'fifth', 'sixth', 'seventh'
+            ],
+            ordered=True
+        )
+
+        exp_top = pd.Series([2., 3., 4., 5., 6., 7., 1.])
+        exp_bot = pd.Series([1., 2., 3., 4., 5., 6., 7.])
+        exp_keep = pd.Series([1., 2., 3., 4., 5., 6., np.NaN])
+
+        assert_series_equal(na_ser.rank(na_option='top'), exp_top)
+        assert_series_equal(na_ser.rank(na_option='bottom'), exp_bot)
+        assert_series_equal(na_ser.rank(na_option='keep'), exp_keep)
+
+        # Test na_option for rank data with ascending False
+        exp_top = pd.Series([7., 6., 5., 4., 3., 2., 1.])
+        exp_bot = pd.Series([6., 5., 4., 3., 2., 1., 7.])
+        exp_keep = pd.Series([6., 5., 4., 3., 2., 1., np.NaN])
+
+        assert_series_equal(
+            na_ser.rank(na_option='top', ascending=False),
+            exp_top
+        )
+        assert_series_equal(
+            na_ser.rank(na_option='bottom', ascending=False),
+            exp_bot
+        )
+        assert_series_equal(
+            na_ser.rank(na_option='keep', ascending=False),
+            exp_keep
+        )
+
+        # Test with pct=True
+        na_ser = pd.Series(
+            ['first', 'second', 'third', 'fourth', np.NaN],
+        ).astype('category').cat.set_categories(
+            ['first', 'second', 'third', 'fourth'],
+            ordered=True
+        )
+        exp_top = pd.Series([0.4, 0.6, 0.8, 1., 0.2])
+        exp_bot = pd.Series([0.2, 0.4, 0.6, 0.8, 1.])
+        exp_keep = pd.Series([0.25, 0.5, 0.75, 1., np.NaN])
+
+        assert_series_equal(na_ser.rank(na_option='top', pct=True), exp_top)
+        assert_series_equal(na_ser.rank(na_option='bottom', pct=True), exp_bot)
+        assert_series_equal(na_ser.rank(na_option='keep', pct=True), exp_keep)
+
     def test_rank_signature(self):
         s = Series([0, 1])
         s.rank(method='average')

Original file line number	Diff line number	Diff line change
`@@ -578,6 +578,7 @@ Bug Fixes`
`578`	`578`
`579`	`579`
`580`	`580`
	`581`	+- Bug in ``.rank()`` which incorrectly ranks ordered categories (:issue:`15420`)
`581`	`582`
`582`	`583`
`583`	`584`